本文整理汇总了 Python 中 theano.misc.strutil.render_string 函数的典型用法代码示例。如果您正苦于以下问题：Python render_string 函数的具体用法？Python render_string 怎么用？Python render_string 使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了 render_string 函数的 8 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: c_code
#.........这里部分代码省略.........
}
{ // extra scope so fail works
//Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: Strides should all be positive but they are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
{ // extra scope so fail works
//Compute correct sized of output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1;
if (PyArray_DIMS(%(dCdH)s)[0] != batchSize ||
PyArray_DIMS(%(dCdH)s)[4] != outputChannels ||
PyArray_DIMS(%(dCdH)s)[1] != outputHeight ||
PyArray_DIMS(%(dCdH)s)[2] != outputWidth ||
PyArray_DIMS(%(dCdH)s)[3] != outputDur)
{
PyErr_Format(PyExc_ValueError, "dCdH is the wrong size, expected (%%i,%%i,%%i,%%i,%%i), got (%%li,%%li,%%li,%%li,%%li)", batchSize, outputHeight, outputWidth, outputDur, outputChannels, (long)PyArray_DIMS(%(dCdH)s)[0], (long)PyArray_DIMS(%(dCdH)s)[1], (long)PyArray_DIMS(%(dCdH)s)[2], (long)PyArray_DIMS(%(dCdH)s)[3], (long)PyArray_DIMS(%(dCdH)s)[4]);
%(fail)s
}
{ // extra scope for fail
npy_intp dims[5];
dims[0] = outputChannels;
dims[4] = inputChannels;
dims[1] = filterHeight;
dims[2] = filterWidth;
dims[3] = filterDur;
if(!(%(dCdW)s) || PyArray_DIMS(%(dCdW)s)[0]!=dims[0] ||
PyArray_DIMS(%(dCdW)s)[1]!=dims[1] ||
PyArray_DIMS(%(dCdW)s)[2]!=dims[2] ||
PyArray_DIMS(%(dCdW)s)[3]!=dims[3] ||
PyArray_DIMS(%(dCdW)s)[4]!=dims[4] ){
Py_XDECREF(%(dCdW)s);
%(dCdW)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, PyArray_DESCR(%(V)s)->type_num);
if (!(%(dCdW)s)) {
PyErr_Format(PyExc_MemoryError,"ConvGrad3D: Could not allocate dCdW");
%(fail)s
}
}
{ //extra scope so fail works
#define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i)*PyArray_STRIDES(x)[0]+(j)*PyArray_STRIDES(x)[1]+(k)*PyArray_STRIDES(x)[2]+(l)*PyArray_STRIDES(x)[3]+(m)*PyArray_STRIDES(x)[4] )
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i) )
const int dhs3 = PyArray_STRIDES(%(dCdH)s)[3];
const int dtvs3 = dt * PyArray_STRIDES(%(V)s)[3];
// Compute dCdW
//TODO-- see if this can be made faster by using ELEM_AT instead of ELEM5
// dCdW[j,k,l,m,z] = sum_i sum_p sum_q sum_r dCdH[i,p,q,r,j] * V[i,dr*p+k,dc*q+l,dt*r+m,z]
for (int j = 0; j < outputChannels; j++) {
for (int z = 0; z < inputChannels; z++) {
for (int k = 0; k < filterHeight; k++) {
for (int l = 0; l < filterWidth; l++) {
for (int m = 0; m < filterDur; m++) {
//printf("writePos %%i %%i %%i %%i %%i \\n",j,k,l,m,z);
dtype_%(dCdW)s & writePos = ELEM5(%(dCdW)s, j,k,l,m,z);
writePos = 0;
for (int i = 0; i < batchSize; i++) {
for (int p = 0; p < outputHeight; p++) {
for (int q = 0; q < outputWidth; q++) {
int Hpos = i * PyArray_STRIDES(%(dCdH)s)[0] + j * PyArray_STRIDES(%(dCdH)s)[4] + p * PyArray_STRIDES(%(dCdH)s)[1] + q * PyArray_STRIDES(%(dCdH)s)[2] ;
int Vpos = i * PyArray_STRIDES(%(V)s)[0] + z * PyArray_STRIDES(%(V)s)[4] + (dr * p+k) * PyArray_STRIDES(%(V)s)[1] + (dc*q+l) * PyArray_STRIDES(%(V)s)[2] + m * PyArray_STRIDES(%(V)s)[3];
for (int r = 0; r < outputDur; r++) {
writePos += ELEM5(%(dCdH)s,i,p,q,r,j) * ELEM5(%(V)s,i,dr*p+k,dc*q+l,dt*r+m,z);
//writePos += ELEM_AT(%(dCdH)s,Hpos) * ELEM_AT(%(V)s,Vpos);
Hpos += dhs3;
Vpos += dtvs3;
}
}
}
}
}
}
}
}
}
}}}}}}} // extra scope for fail
///////////// < /code generated by ConvGradW3D >
"""
return strutil.render_string(codeSource, locals())
示例2: c_code
#.........这里部分代码省略.........
codeSource += """
{
//General case code
//std::cout << "general case code" << std::endl;
long long Hpos = 0;
long long Vpos = 0;
for (int i = 0; i < batchSize; i++) {
long long Hposi = Hpos;
long long Vposi = Vpos;
for (int r = 0; r < outputHeight; r++) {
long long Hposr = Hpos;
long long Vposr = Vpos;
for (int c = 0; c < outputWidth; c++) {
long long Hposc = Hpos;
long long Vposc = Vpos;
for (int t = 0; t < outputDur; t++) {
long long Hpost = Hpos;
long long Vpost = Vpos;
//of the loops so far, j should be the innermost, because
//each loop through j visits the same elements of V
//this implies that the last index of H should be the j index
//since V and H should have the same format, this means
//z should be the last index in v, and therefore the innermost
//of the next set of for loops
int Wpos = 0;
int bPos = 0;
for (int j = 0; j < outputChannels; j++) {
long long Hposj = Hpos;
long long Vposj = Vpos;
int Wposj = Wpos;
// H[i,r,c,t,j] = b[j]
dtype_%(H)s & writePos = ELEM_AT(%(H)s,Hpos);
writePos = ELEM_AT(%(b)s,bPos);
for (int k =0; k < filterHeight; k++) {
int Wposk = Wpos;
long long Vposk = Vpos;
for (int l = 0; l < filterWidth; l++) {
int Wposl = Wpos;
long long Vposl = Vpos;
for (int m = 0; m < filterDur; m++) {
int Wposm = Wpos;
long long Vposm = Vpos;
for (int z = 0; z < inputChannels; z++) {
//H[i,r,c,t,j] += W[j,z,k,l,m] * V[i,dr*r+k, dc*c+l, dt*t+m,z]
writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(V)s,Vpos);
Wpos += ws4;
Vpos += vs4;
} // close z
Wpos = Wposm + ws3;
Vpos = Vposm + vs3;
} // close m
Wpos = Wposl + ws2;
Vpos = Vposl + vs2;
} //close l
Wpos = Wposk + PyArray_STRIDES(%(W)s)[1];
Vpos = Vposk + PyArray_STRIDES(%(V)s)[1];
} //close k
bPos += bs;
Wpos = Wposj + ws0;
Hpos = Hposj + hs4;
Vpos = Vposj;
//std::cout << "incremented Wpos by " << ws0 << std::endl;
//std::cout << "incremented Hpos by " << hs4 << std::endl;
} //close j
Hpos = Hpost + PyArray_STRIDES(%(H)s)[3];
Vpos = Vpost + vs3 * dt;
} //close t
Hpos = Hposc + PyArray_STRIDES(%(H)s)[2];
Vpos = Vposc + vs2 * dc;
} //close c
Hpos = Hposr + PyArray_STRIDES(%(H)s)[1];
Vpos = Vposr + PyArray_STRIDES(%(V)s)[1] * dr;
} //closes r
Hpos = Hposi + PyArray_STRIDES(%(H)s)[0];
Vpos = Vposi + PyArray_STRIDES(%(V)s)[0];
} //closes i
} //closes general case code
}}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by Conv3D >
"""
return strutil.render_string(codeSource,locals())
示例3: c_code
#.........这里部分代码省略.........
videoWidth = RShape1;
videoDur = RShape2;
}
}
//Allocate the reconstruction
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = inputChannels;
dims[1] = videoHeight;
dims[2] = videoWidth;
dims[3] = videoDur;
if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){
Py_XDECREF(%(R)s);
%(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(R)s)) {
PyErr_Format(PyExc_MemoryError,"Could not allocate R");
%(fail)s;
}
}
cudaMemset(CudaNdarray_DEV_DATA(%(R)s), 0, 4 * batchSize * inputChannels * videoHeight * videoWidth * videoDur);
{ // for fail
bool out_contiguous = CudaNdarray_is_c_contiguous(%(R)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
printf("b stride0=%%d\\n",CudaNdarray_HOST_STRIDES(%(b)s)[0]);
bool work_complete = false;
const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
const int hs4 = CudaNdarray_HOST_STRIDES(%(H)s)[4];
const int hs3 = CudaNdarray_HOST_STRIDES(%(H)s)[3];
const int hs2 = CudaNdarray_HOST_STRIDES(%(H)s)[2];
const int hs1 = CudaNdarray_HOST_STRIDES(%(H)s)[1];
const int hs0 = CudaNdarray_HOST_STRIDES(%(H)s)[0];
if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_complete){
//conv_transp_rows_stack
dim3 grid(batchSize * inputChannels, videoHeight * videoWidth);
dim3 threads(videoDur);
HERE
int shared_size=0;
conv_transp_rows_stack<<<grid, threads, shared_size>>>(
CudaNdarray_DEV_DATA(%(H)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(R)s),
videoHeight, videoWidth, videoDur,
filterHeight, filterWidth, filterDur,
outputHeight, outputWidth, outputDur,
outputChannels, inputChannels,
dr,dc,dt,
hs3,hs2,hs1,hs4,hs0,
ws3,ws2,ws1,ws4,ws0,
CudaNdarray_HOST_STRIDES(%(b)s)[0]);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_transp_rows_stack' version\\n");
}
else
{
if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("ERROR: all implementations failed for GpuConvTransp3D! (%%s)",cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)",
cudaGetErrorString(sts));
%(fail)s
}
}
if(!work_complete){
PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConvTransp3D! out_contiguous=%%d b_strided=%%d outputDur=%%d",
out_contiguous,b_strided,outputDur);
%(fail)s
}
}}}}}} // for fail
///////////// < /code generated by GpuConvTransp3D >
"""
return strutil.render_string(codeSource, locals())
示例4: c_code
#.........这里部分代码省略.........
PyArray_DIMS(%(R)s)[4]!=dims[4])
{
Py_XDECREF(%(R)s);
%(R)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, PyArray_DESCR(%(H)s)->type_num);
if (!(%(R)s)) {
PyErr_Format(PyExc_MemoryError, "ConvTransp3D: could not allocate R");
%(fail)s
}
}
{ // for fail 6
#define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i)*PyArray_STRIDES(x)[0]+(j)*PyArray_STRIDES(x)[1]+(k)*PyArray_STRIDES(x)[2]+(l)*PyArray_STRIDES(x)[3]+(m)*PyArray_STRIDES(x)[4] )
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i) )
dtype_%(b)s * b = (dtype_%(b)s *) PyArray_DATA(%(b)s);
int rs4 = PyArray_STRIDES(%(R)s)[4];
int ws0 = PyArray_STRIDES(%(W)s)[0];
int ws4 = PyArray_STRIDES(%(W)s)[4];
int hs4 = PyArray_STRIDES(%(H)s)[4];
// Compute R
// R[i,r,c,t,j] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, rk, ck, tk,j] * H[i,rc,cc,tc,k]
for (int i = 0; i < batchSize; i++) {
for (int r = 0; r < videoHeight; r++) {
const int frc = (int)std::max(0.0f, ceilf(float(r-filterHeight+1)/float(dr)));
for (int c = 0; c < videoWidth; c++) {
const int fcc = (int)std::max(0.0f, ceilf(float(c-filterWidth +1)/float(dc)));
for (int t = 0; t < videoDur; t++) {
const int ftc = (int)std::max(0.0f, ceilf(float(t-filterDur +1) /float(dt)));
long long Rpost = i * PyArray_STRIDES(%(R)s)[0] + r * PyArray_STRIDES(%(R)s)[1] + c * PyArray_STRIDES(%(R)s)[2] + t * PyArray_STRIDES(%(R)s)[3];
long long Rpos = Rpost;
for (int j = 0; j < inputChannels; j++)
{
//ELEM5(%(R)s, i,r,c,t,j) = b[j];
ELEM_AT(%(R)s,Rpos) = b[j];
Rpos += rs4;
}
for (int rc = frc; rc < outputHeight; rc++) {
const int rk = r - rc * dr;
if (rk < 0) break;
for (int cc = fcc; cc < outputWidth; cc++) {
const int ck = c - cc * dc;
if (ck < 0) break;
for (int tc = ftc; tc < outputDur; tc++)
{
const int tk = t - tc * dt;
if (tk < 0) break;
int Wpos = rk * PyArray_STRIDES(%(W)s)[1] + ck * PyArray_STRIDES(%(W)s)[2] + tk * PyArray_STRIDES(%(W)s)[3];
int Hpostc = i * PyArray_STRIDES(%(H)s)[0] + rc * PyArray_STRIDES(%(H)s)[1] + cc * PyArray_STRIDES(%(H)s)[2] + tc * PyArray_STRIDES(%(H)s)[3];
Rpos = Rpost;
for (int j = 0; j < inputChannels; j++)
{
int Wposj = Wpos;
dtype_%(R)s & writePos = ELEM_AT(%(R)s,Rpos);
int Hpos = Hpostc;
for (int k = 0; k < outputChannels; k++) {
//TODO-- it's probably bad in terms of cache that our inner loop is over the largest stride of W.... maybe OK since it's the smallest stride of H
//writePos += ELEM5(%(W)s,k,rk,ck,tk,j) * ELEM5(%(H)s,i,rc,cc,tc,k);
//writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);
writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);
Wpos += ws0;
Hpos += hs4;
} //close the k loop
Rpos += rs4;
Wpos = Wposj + ws4;
} //close the j loop
} // close the tc loop
} //cc
} //rc
} //t
} //c
} //r
} //i
} //for fail 6
} //for fail 5
} //for fail 4
} //for fail 3
} //for fail 2
} // for fail 1
///////////// < /code generated by ConvTransp3D >
"""
return strutil.render_string(codeSource, locals())
示例5: c_code
#.........这里部分代码省略.........
setup_nv_weights_grads = """
int filters_dims[4];
// filters: (input channels, filter rows, filter cols, output channels)
filters_dims[0] = img_channels;
filters_dims[1] = imgSizeY - hidGradsSizeY + 1 - 2 * paddingStart;
filters_dims[2] = imgSizeX - hidGradsSizeX + 1 - 2 * paddingStart;
assert(filters_dims[1] == filters_dims[2]); // only square kernels are supported
filters_dims[3] = numFilters;
const int filterSize = filters_dims[1];
int partialsum_storage_dims[5];
for (int i = 1; i < 5; i++)
{
partialsum_storage_dims[i] = filters_dims[i - 1];
}
partialsum_storage_dims[0] = numModules / partialSum;
CudaNdarray *partialsum_storage = NULL;
if (partialSum != numModules &&
CudaNdarray_prep_output(&partialsum_storage, 5,
partialsum_storage_dims))
{
%(fail)s;
}
for (int i = 0; i < 4; i++)
{
if (filters_dims[i] <= 0)
{
printf("filters_dims[%%d] = %%d\\n", i, filters_dims[i]);
assert(false);
}
}
if (CudaNdarray_prep_output(& %(weights_grads)s, 4, filters_dims))
{
Py_DECREF(partialsum_storage);
%(fail)s;
}
{ // setup_nv_weights_grad brace # 1
NVMatrix nv_weights_grads(%(weights_grads)s, filters_dims[0] * filterSize * filterSize, numFilters,
"weight_acts:nv_weights_grads");
"""
num_braces += 1
# note: imgSizeX is not specified here, it is computed internally
# (in _filterActsSparse) by the lines:
# int imgPixels = images.getNumRows() / numImgColors;
# int imgSizeX = imgPixels / imgSizeY;
#
# note: numFilters is not specified here. it is determined by
# nv_filters.getNumCols()
#
# note: the size of the filters is determined by dividing
# nv_filters.getNumRows() by numFilterColors
#
run_kernel = """
if (partialSum == numModules)
_weightActs(nv_images, nv_hid_grads, nv_weights_grads,
imgSizeY, hidGradsSizeY, hidGradsSizeX, filterSize,
paddingStart, moduleStride, img_channels, numGroups,
partialSum, 0, 1);
else {
NVMatrix nv_partialsum(partialsum_storage, (numModules / partialSum) *
filters_dims[0] * filterSize * filterSize, numFilters,
"weight_acts: nv_partialsum");
_weightActs(nv_images, nv_hid_grads, nv_partialsum,
imgSizeY, hidGradsSizeY, hidGradsSizeX, filterSize,
paddingStart, moduleStride, img_channels, numGroups,
partialSum, 0, 1);
nv_partialsum.reshape((numModules / partialSum), filters_dims[0] * filterSize * filterSize * numFilters);
// sum out axis 0 of nv_partialsum
#define AXIS 0
// scale the contents of nv_weights_grads by 0
// i.e., clear out its pre-existing content
#define SCALE_THIS 0
// scale the new sum by 1, i.e., don't do any scaling
#define SCALE_SUM 1
nv_weights_grads.addSum(nv_partialsum, AXIS, SCALE_THIS, SCALE_SUM);
Py_DECREF(partialsum_storage);
}
"""
braces = '}' * num_braces
rval = (basic_setup +
setup_nv_images +
setup_nv_hid_grads +
setup_nv_weights_grads +
run_kernel +
braces)
rval = render_string(rval, locals())
return rval
示例6: c_code
#.........这里部分代码省略.........
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
%(fail)s
}
{ // extra scope so fail works
//Make correctly sized output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1;
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = outputChannels;
dims[1] = outputHeight;
dims[2] = outputWidth;
dims[3] = outputDur;
if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
Py_XDECREF(%(H)s);
%(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(H)s)) {
PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
%(fail)s
}
}
{ // extra scope so fail will not cross declarations
//#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )####################
const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
// Compute H
//H[i,x,y,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i, dr*r+k,dc*c+l,dt*t+m,z]
bool out_contiguous = CudaNdarray_is_c_contiguous(%(H)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
bool work_complete = false;
if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 && !work_complete){
//conv_rows_stack
dim3 grid(outputHeight*outputWidth,batchSize*outputChannels);
dim3 threads(outputDur);
int shared_size=0;
conv_rows_stack<<<grid, threads, shared_size>>>(
CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s),
vidHeight, vidWidth, vidDur,
filterHeight, filterWidth, filterDur,
outputChannels, inputChannels,
dr,dc,dt,
vs3,vs2,vs1,vs4,vs0,
ws3,ws2,ws1,ws4,ws0);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
}
else
{
if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("ERROR: all implementations failed for GpuConv3D! (%%s)",cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConv3D! (%%s)",
cudaGetErrorString(sts));
%(fail)s
}
}
if(!work_complete){
PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
%(fail)s
}
}}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by GpuConv3D >
"""
return strutil.render_string(codeSource, locals())
示例7: c_code
#.........这里部分代码省略.........
//Compute correctl sized of output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1;
if (CudaNdarray_HOST_DIMS(%(dCdH)s)[0] != batchSize ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[4] != outputChannels ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[1] != outputHeight ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[2] != outputWidth ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[3] != outputDur)
{
PyErr_Format(PyExc_ValueError, "dCdH is the wrong size, expected (%%i,%%i,%%i,%%i,%%i), got (%%i,%%i,%%i,%%i,%%i)", batchSize, outputHeight, outputWidth, outputDur, outputChannels, CudaNdarray_HOST_DIMS(%(dCdH)s)[0], CudaNdarray_HOST_DIMS(%(dCdH)s)[1], CudaNdarray_HOST_DIMS(%(dCdH)s)[2] ,CudaNdarray_HOST_DIMS(%(dCdH)s)[3], CudaNdarray_HOST_DIMS(%(dCdH)s)[4] );
%(fail)s
}
{ // extra scope for fail
npy_intp dims[5];
dims[0] = outputChannels;
dims[4] = inputChannels;
dims[1] = filterHeight;
dims[2] = filterWidth;
dims[3] = filterDur;
if(!(%(dCdW)s) || CudaNdarray_HOST_DIMS(%(dCdW)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[4]!=dims[4] ){
Py_XDECREF(%(dCdW)s);
%(dCdW)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(dCdW)s)) {
PyErr_Format(PyExc_MemoryError, "GpuConvGrad3D: Could not allocated dCdW");
%(fail)s
}
}
{ //for fail
const int dcdhs4 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[4];
const int dcdhs3 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[3];
const int dcdhs1 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[1];
const int dcdhs2 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[2];
const int dcdhs0 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[0];
const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
bool out_contiguous = CudaNdarray_is_c_contiguous(%(dCdW)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool work_complete = false;
if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_complete){
//conv_rows_stack
dim3 grid(WShape[0]*WShape[4],WShape[1]*WShape[2]);//outputHeight*outputWidth);
dim3 threads(WShape[3]);
int shared_size=0;
convgrad_rows_stack<<<grid, threads, shared_size>>>(
CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(dCdH)s), CudaNdarray_DEV_DATA(%(dCdW)s),
vidHeight, vidWidth, vidDur,
filterHeight, filterWidth, filterDur,
WShape[0], WShape[1], WShape[2], WShape[3], WShape[4],
outputHeight,outputWidth,outputDur,
batchSize, outputChannels, inputChannels,
dr,dc,dt,
vs3,vs2,vs1,vs4,vs0,
dcdhs3,dcdhs2,dcdhs1,dcdhs4,dcdhs0);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
}
else
{
if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("ERROR: all implementations failed for GpuConv3D! (%%s)",cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvGrad3D! (%%s)",
cudaGetErrorString(sts));
%(fail)s
}
}
if(!work_complete){
PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
%(fail)s
}
}}}}} // extra scope for fail
///////////// < /code generated by GpuConvGrad3D >
"""
return strutil.render_string(codeSource, locals())
示例8: c_code
#.........这里部分代码省略.........
PyErr_Format(PyExc_ValueError,
"filter must be square, but have shape (%%d, %%d).",
filters_dims[1], filters_dims[2]);
%(fail)s;
}
else if (moduleStride > filters_dims[1]) {
PyErr_Format(PyExc_ValueError,
"stride %%d greater than filter size (%%d, %%d)",
moduleStride, filters_dims[1], filters_dims[2]);
%(fail)s;
}
filters_dims[3] = numFilters;
const int filterSize = filters_dims[1];
int partialsum_storage_dims[5];
for (int i = 1; i < 5; i++)
{
partialsum_storage_dims[i] = filters_dims[i - 1];
}
partialsum_storage_dims[0] = numModules / partialSum;
if (partialSum != numModules &&
CudaNdarray_prep_output(&%(partialsum_storage)s, 5,
partialsum_storage_dims))
{
%(fail)s;
}
for (int i = 0; i < 4; i++)
{
if (filters_dims[i] <= 0)
{
printf("filters_dims[%%d] = %%d\\n", i, filters_dims[i]);
assert(false);
}
}
if (CudaNdarray_prep_output(& %(weights_grads)s, 4, filters_dims))
{
%(fail)s;
}
{ // setup_nv_weights_grad brace # 1
NVMatrix nv_weights_grads(%(weights_grads)s, filters_dims[0] * filterSize
* filterSize, numFilters,
"weight_acts:nv_weights_grads");
"""
num_braces += 1
# note: imgSizeX is not specified here, it is computed internally
# (in _filterActsSparse) by the lines:
# int imgPixels = images.getNumRows() / numImgColors;
# int imgSizeX = imgPixels / imgSizeY;
#
# note: numFilters is not specified here. it is determined by
# nv_filters.getNumCols()
#
# note: the size of the filters is determined by dividing
# nv_filters.getNumRows() by numFilterColors
#
run_kernel = """
if (partialSum == numModules)
_weightActs(nv_images, nv_hid_grads, nv_weights_grads,
imgSizeY, hidGradsSizeY, hidGradsSizeX, filterSize,
paddingStart, moduleStride, img_channels, numGroups,
partialSum, 0, 1);
else {
NVMatrix nv_partialsum(%(partialsum_storage)s, (numModules / partialSum) *
filters_dims[0] * filterSize * filterSize, numFilters,
"weight_acts: nv_partialsum");
_weightActs(nv_images, nv_hid_grads, nv_partialsum,
imgSizeY, hidGradsSizeY, hidGradsSizeX, filterSize,
paddingStart, moduleStride, img_channels, numGroups,
partialSum, 0, 1);
nv_partialsum.reshape((numModules / partialSum), filters_dims[0] * filterSize * filterSize * numFilters);
// sum out axis 0 of nv_partialsum
#define AXIS 0
// scale the contents of nv_weights_grads by 0
// i.e., clear out its pre-existing content
#define SCALE_THIS 0
// scale the new sum by 1, i.e., don't do any scaling
#define SCALE_SUM 1
nv_weights_grads.addSum(nv_partialsum, AXIS, SCALE_THIS, SCALE_SUM);
}
"""
braces = '}' * num_braces
rval = (basic_setup +
setup_nv_images +
setup_nv_hid_grads +
setup_nv_weights_grads +
run_kernel +
braces)
rval = render_string(rval, locals())
return rval