本文整理汇总了C++中Matrix2D::GetFlatData方法的典型用法代码示例。如果您正苦于以下问题：C++ Matrix2D::GetFlatData方法的具体用法？C++ Matrix2D::GetFlatData怎么用？C++ Matrix2D::GetFlatData使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Matrix2D的用法示例。
在下文中一共展示了Matrix2D::GetFlatData方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: MICStencil<T>::operator()
// Runs nIters sweeps of a 9-point weighted stencil over mtx on an Intel MIC
// coprocessor using offload pragmas, with OpenMP threads tiling the interior.
//
// Parameters:
//   mtx    - matrix whose flat storage (GetFlatData()) is shipped to the
//            coprocessor; it is treated as square, with GetNumRows() elements
//            per side (halo included).
//   nIters - number of stencil sweeps performed inside the offload region.
//
// NOTE: LINESIZE and the ALLOC / RETAIN / REUSE / FREE tokens used in the
// offload clauses are defined outside this excerpt; presumably LINESIZE is the
// cache-line size in bytes and the others expand to alloc_if/free_if
// modifiers - confirm against their definitions.
// NOTE: the end of this function is not visible in this excerpt.
template <class T> void
MICStencil<T>::operator()( Matrix2D<T>& mtx, unsigned int nIters )
{
// Side length of the (square) image, halo included.
unsigned int uDimWithHalo = mtx.GetNumRows();
// Halo thickness in elements: as many elements of T as fit in LINESIZE bytes.
unsigned int uHaloWidth = LINESIZE / sizeof(T);
unsigned int uImgElements = uDimWithHalo * uDimWithHalo;
// Local, MIC-targeted copies of the data pointer and the three stencil
// weights so they can be named in the offload clauses below.
__declspec(target(mic), align(LINESIZE)) T* pIn = mtx.GetFlatData();
__declspec(target(mic), align(sizeof(T))) T wcenter = this->wCenter;
__declspec(target(mic), align(sizeof(T))) T wdiag = this->wDiagonal;
__declspec(target(mic), align(sizeof(T))) T wcardinal = this->wCardinal;
// First offload: empty body, so it only moves pIn to the coprocessor.
#pragma offload target(mic) in(pIn:length(uImgElements) ALLOC RETAIN)
{
// Just copy pIn to compute the copy transfer time
}
// Second offload: the actual computation, reusing the buffer sent above.
#pragma offload target(mic) in(pIn:length(uImgElements) REUSE RETAIN) \
in(uImgElements) in(uDimWithHalo) \
in(wcenter) in(wdiag) in(wcardinal)
{
// Row partitions are derived from the online processor count divided by 4,
// minus one; column partitions are fixed at 4.
// NOTE(review): if sysconf reports fewer than 8 processors this evaluates to
// 0 and the divisions below divide by zero - confirm this cannot happen.
unsigned int uRowPartitions = sysconf(_SC_NPROCESSORS_ONLN) / 4 - 1;
unsigned int uColPartitions = 4; // Threads per core for KNC
// Tile extents over the interior (side length minus both halos).
unsigned int uRowTileSize = (uDimWithHalo - 2 * uHaloWidth) / uRowPartitions;
unsigned int uColTileSize = (uDimWithHalo - 2 * uHaloWidth) / uColPartitions;
// Round the row tile up when the interior does not divide evenly.
// NOTE(review): the column tile size is not rounded up the same way, so
// trailing interior columns look uncovered when the interior width is not a
// multiple of uColPartitions - confirm whether inputs guarantee divisibility.
uRowTileSize = ((uDimWithHalo - 2 * uHaloWidth) % uRowPartitions > 0) ? (uRowTileSize + 1) : (uRowTileSize);
// Should use the "Halo Val" when filling the memory space
// pTmp is the buffer read from, pCrnt the LINESIZE-aligned, zero-filled
// scratch buffer written to.
// NOTE(review): the _mm_malloc result is handed to memset without a null check.
T *pTmp = (T*)pIn;
T *pCrnt = (T*)memset((T*)_mm_malloc(uImgElements * sizeof(T), LINESIZE), 0, uImgElements * sizeof(T));
// firstprivate gives every thread its own copy of the two buffer pointers, so
// the swaps at the end of each sweep do not touch the outer pTmp / pCrnt.
#pragma omp parallel firstprivate(pTmp, pCrnt, uRowTileSize, uColTileSize, uHaloWidth, uDimWithHalo)
{
// Map the thread id onto a (row tile, column tile) pair.
unsigned int uThreadId = omp_get_thread_num();
unsigned int uRowTileId = uThreadId / uColPartitions;
unsigned int uColTileId = uThreadId % uColPartitions;
// Tile origin, offset past the halo.
unsigned int uStartLine = uRowTileId * uRowTileSize + uHaloWidth;
unsigned int uStartCol = uColTileId * uColTileSize + uHaloWidth;
// Tile end (exclusive), clamped so it never reaches into the far halo.
unsigned int uEndLine = uStartLine + uRowTileSize;
uEndLine = (uEndLine > (uDimWithHalo - uHaloWidth)) ? uDimWithHalo - uHaloWidth : uEndLine;
unsigned int uEndCol = uStartCol + uColTileSize;
uEndCol = (uEndCol > (uDimWithHalo - uHaloWidth)) ? uDimWithHalo - uHaloWidth : uEndCol;
T cardinal0 = 0.0;
T diagonal0 = 0.0;
T center0 = 0.0;
unsigned int cntIterations, i, j;
for (cntIterations = 0; cntIterations < nIters; cntIterations ++)
{
// Do Stencil Operation
for (i = uStartLine; i < uEndLine; i++)
{
// Row pointers: current row, the rows above and below it in the input
// buffer, and the matching row of the output buffer.
T * pCenter = &pTmp [ i * uDimWithHalo];
T * pTop = pCenter - uDimWithHalo;
T * pBottom = pCenter + uDimWithHalo;
T * pOut = &pCrnt[ i * uDimWithHalo];
// Tell the compiler each row pointer is 64-byte aligned.
// NOTE(review): this holds only if the row pitch in bytes is a multiple
// of 64 - TODO confirm for the matrix sizes used.
__assume_aligned(pCenter, 64);
__assume_aligned(pTop, 64);
__assume_aligned(pBottom, 64);
__assume_aligned(pOut, 64);
// NOTE(review): the vector-length hint names float regardless of T.
#pragma simd vectorlengthfor(float)
for (j = uStartCol; j < uEndCol; j++)
{
// Sum of the four edge neighbours, sum of the four corner neighbours,
// and the centre value, combined with their respective weights.
cardinal0 = pCenter[j - 1] + pCenter[j + 1] + pTop[j] + pBottom[j];
diagonal0 = pTop[j - 1] + pTop[j + 1] + pBottom[j - 1] + pBottom[j + 1];
center0 = pCenter[j];
pOut[j] = wcardinal * cardinal0 + wdiag * diagonal0 + wcenter * center0;
}
}
// All threads must finish writing this sweep before any thread starts
// reading the freshly written buffer in the next one.
#pragma omp barrier
;
// Switch pointers
T* pAux = pTmp;
pTmp = pCrnt;
pCrnt = pAux;
} // End For
} // End Parallel
// The outer pCrnt was never swapped (the swaps acted on firstprivate copies),
// so this releases the scratch buffer allocated above.
// NOTE(review): after an odd number of sweeps the most recent result lives in
// that scratch buffer rather than in pIn - confirm how the caller / omitted
// code accounts for this.
_mm_free(pCrnt);
} // End Offload
// Third offload: empty body, so it only copies pIn back to the host.
#pragma offload target(mic) out(pIn:length(uImgElements) REUSE FREE)
{
// Just copy back pIn
}
//.........这里部分代码省略.........
示例2: MPICUDAStencil<T>::DoPreIterationWork
// Hook run before stencil iteration `iter`. Every
// GetNumberIterationsPerHaloExchange() iterations (never at iteration 0) it
// copies boundary strips from the device buffer into the host matrix, runs
// DoHaloExchange on the host matrix, and copies refreshed halo data back to
// the device.
//
// Parameters:
//   currBuf - device buffer that is read from and written back to.
//   altBuf  - second device buffer; not referenced in the visible part of
//             this function.
//   mtx     - host-side matrix used as the staging area for the exchange.
//   iter    - index of the iteration about to run.
//
// NOTE: dumpData and ofs are not declared in this excerpt; presumably they are
// members or file-scope debug facilities - verify against the full file.
// NOTE(review): none of the cudaMemcpy / cudaMemcpy2D return codes below are
// checked.
// NOTE: the end of this function is not visible in this excerpt.
void
MPICUDAStencil<T>::DoPreIterationWork( T* currBuf, // in device global memory
T* altBuf, // in device global memory
Matrix2D<T>& mtx,
unsigned int iter )
{
// do the halo exchange at desired frequency
// note that we *do not* do the halo exchange here before the
// first iteration, because we did it already (before we first
// pushed the data onto the device) in our operator() method.
// The same value serves as the exchange period and as the strip thickness
// (in rows / columns) used in all offsets and sizes below.
unsigned int haloWidth = this->GetNumberIterationsPerHaloExchange();
if( (iter > 0) && (iter % haloWidth) == 0 )
{
unsigned int nRows = mtx.GetNumRows();
unsigned int nCols = mtx.GetNumColumns();
unsigned int nPaddedCols = mtx.GetNumPaddedColumns();
T* flatData = mtx.GetFlatData();
// Element counts and byte sizes of a north/south strip (haloWidth full padded
// rows) and of an east/west strip (haloWidth columns over all rows).
// ewDataItemCount / ewDataSize are not used in the visible part of the function.
size_t nsDataItemCount = haloWidth * nPaddedCols;
size_t ewDataItemCount = haloWidth * nRows;
size_t nsDataSize = nsDataItemCount * sizeof(T);
size_t ewDataSize = ewDataItemCount * sizeof(T);
//
// read current data off device
// we only read halo, and only for sides where we have a neighbor
//
if( this->HaveNorthNeighbor() )
{
// north data is contiguous - copy directly into matrix
// Source and destination both start at row haloWidth, i.e. the haloWidth
// rows immediately below the top strip.
cudaMemcpy( flatData + (haloWidth * nPaddedCols), // dest
currBuf + (haloWidth * nPaddedCols), // src
nsDataSize, // amount to transfer
cudaMemcpyDeviceToHost ); // direction
}
if( this->HaveSouthNeighbor() )
{
// south data is contiguous - copy directly into matrix
// Starts at row nRows - 2*haloWidth, i.e. the haloWidth rows immediately
// above the bottom strip.
cudaMemcpy( flatData + ((nRows - 2*haloWidth)*nPaddedCols), // dest
currBuf + ((nRows - 2*haloWidth)*nPaddedCols), // src
nsDataSize, // amount to transfer
cudaMemcpyDeviceToHost ); // direction
}
if( this->HaveEastNeighbor() )
{
// east data is non-contiguous - but CUDA has a strided read
// Column strip starting at column nCols - 2*haloWidth, haloWidth elements
// wide, over all nRows rows; both pitches are the padded row size in bytes.
cudaMemcpy2D( flatData + (nCols - 2*haloWidth), // dest
nPaddedCols * sizeof(T), // dest pitch
currBuf + (nCols - 2*haloWidth), // src
nPaddedCols * sizeof(T), // src pitch
haloWidth * sizeof(T), // width of data to transfer (bytes)
nRows, // height of data to transfer (rows)
cudaMemcpyDeviceToHost ); // transfer direction
}
if( this->HaveWestNeighbor() )
{
// west data is non-contiguous - but CUDA has a strided read
// Column strip starting at column haloWidth, haloWidth elements wide, over
// all nRows rows.
cudaMemcpy2D( flatData + haloWidth, // dest
nPaddedCols * sizeof(T), // dest pitch
currBuf + haloWidth, // src
nPaddedCols * sizeof(T), // src pitch
haloWidth * sizeof(T), // width of data to transfer (bytes)
nRows, // height of data to transfer (rows)
cudaMemcpyDeviceToHost ); // transfer direction
}
//
// do the actual halo exchange
//
// Optional debug dump of the host matrix before and after the exchange.
if( dumpData )
{
DumpData( ofs, mtx, "before halo exchange" );
}
DoHaloExchange( mtx );
if( dumpData )
{
DumpData( ofs, mtx, "after halo exchange" );
}
//
// push updated data back onto device
// we only write halo, and only for sides where we have a neighbor
//
if( this->HaveNorthNeighbor() )
{
// north data is contiguous - copy directly from matrix
// Writes the first haloWidth rows (offset 0) of the host matrix to the
// start of the device buffer.
cudaMemcpy( currBuf, // dest
flatData, // src
nsDataSize, // amount to transfer
cudaMemcpyHostToDevice ); // direction
}
if( this->HaveSouthNeighbor() )
{
// south data is contiguous - copy directly from matrix
//.........这里部分代码省略.........