本文整理汇总了C++中TESTING_MALLOC_CPU函数的典型用法代码示例。如果您正苦于以下问题:C++ TESTING_MALLOC_CPU函数的具体用法?C++ TESTING_MALLOC_CPU怎么用?C++ TESTING_MALLOC_CPU使用的例子?那么,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了TESTING_MALLOC_CPU函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: get_LU_error
/**
 * Measures how closely an LU factorization reproduces the matrix it came from.
 *
 * The factors are unpacked from LU into a unit lower-triangular L
 * (M x min(M,N)) and an upper-triangular U (min(M,N) x N); the value returned
 * is ||L*U - P*A||_F / (||A||_F * N), with P the row interchanges in IPIV.
 *
 * Both matrix arguments are modified: A has the row interchanges applied to it
 * in place, and LU is overwritten with the difference L*U - P*A.
 *
 * @param M     number of rows of A and LU
 * @param N     number of columns of A and LU
 * @param A     original matrix, column-major, leading dimension lda
 * @param lda   leading dimension shared by A and LU
 * @param LU    packed L and U factors, column-major, leading dimension lda
 * @param IPIV  pivot vector handed to zlaswp for rows 1..min(M,N)
 * @return      the scaled Frobenius-norm residual described above
 */
double get_LU_error(magma_int_t M, magma_int_t N,
                    magmaDoubleComplex *A, magma_int_t lda,
                    magmaDoubleComplex *LU, magma_int_t *IPIV)
{
    magma_int_t min_mn = min(M,N);
    magma_int_t unit_stride = 1;                      /* passed by address to Fortran routines */
    magmaDoubleComplex gemm_scale = MAGMA_Z_ONE;      /* multiplier on L*U        */
    magmaDoubleComplex gemm_accum = MAGMA_Z_ZERO;     /* multiplier on old output */
    magmaDoubleComplex *L, *U;
    double norm_scratch[1];

    /* Zero-filled workspaces, so entries outside the copied triangles are 0. */
    TESTING_MALLOC_CPU( L, magmaDoubleComplex, M*min_mn );
    TESTING_MALLOC_CPU( U, magmaDoubleComplex, min_mn*N );
    memset( L, 0, M*min_mn*sizeof(magmaDoubleComplex) );
    memset( U, 0, min_mn*N*sizeof(magmaDoubleComplex) );

    /* Turn A into P*A so it can be compared against L*U directly. */
    lapackf77_zlaswp( &N, A, &lda, &unit_stride, &min_mn, IPIV, &unit_stride );

    /* Split the packed factors into their two triangles. */
    lapackf77_zlacpy( MagmaLowerStr, &M, &min_mn, LU, &lda, L, &M );
    lapackf77_zlacpy( MagmaUpperStr, &min_mn, &N, LU, &lda, U, &min_mn );

    /* The lower-triangle copy carried U's diagonal; L's diagonal is all ones. */
    for (magma_int_t d = 0; d < min_mn; ++d) {
        L[d + d*M] = MAGMA_Z_MAKE( 1., 0. );
    }

    const double norm_A = lapackf77_zlange("f", &M, &N, A, &lda, norm_scratch);

    /* LU := L*U, reusing the caller's LU storage for the product. */
    blasf77_zgemm("N", "N", &M, &N, &min_mn,
                  &gemm_scale, L, &M, U, &min_mn, &gemm_accum, LU, &lda);

    /* LU := L*U - P*A, one column at a time. */
    for (magma_int_t col = 0; col < N; ++col) {
        magmaDoubleComplex *prod_col = LU + col*lda;
        magmaDoubleComplex *ref_col  = A  + col*lda;
        for (magma_int_t row = 0; row < M; ++row) {
            prod_col[row] = MAGMA_Z_SUB( prod_col[row], ref_col[row] );
        }
    }

    const double norm_diff = lapackf77_zlange("f", &M, &N, LU, &lda, norm_scratch);

    TESTING_FREE_CPU( L );
    TESTING_FREE_CPU( U );

    return norm_diff / (norm_A * N);
}
示例2: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing cprint
*/
int main( int argc, char** argv)
{
TESTING_INIT();
magmaFloatComplex *hA, *dA;
//magma_int_t ione = 1;
//magma_int_t ISEED[4] = {0,0,0,1};
magma_int_t M, N, lda, ldda; //size
magma_int_t status = 0;
magma_opts opts;
parse_opts( argc, argv, &opts );
for( int itest = 0; itest < opts.ntest; ++itest ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
M = opts.msize[itest];
N = opts.nsize[itest];
lda = M;
ldda = ((M + 31)/32)*32;
//size = lda*N;
/* Allocate host memory for the matrix */
TESTING_MALLOC_CPU( hA, magmaFloatComplex, lda *N );
TESTING_MALLOC_DEV( dA, magmaFloatComplex, ldda*N );
//lapackf77_clarnv( &ione, ISEED, &size, hA );
for( int j = 0; j < N; ++j ) {
for( int i = 0; i < M; ++i ) {
hA[i + j*lda] = MAGMA_C_MAKE( i + j*0.01, 0. );
}
}
magma_csetmatrix( M, N, hA, lda, dA, ldda );
printf( "A=" );
magma_cprint( M, N, hA, lda );
printf( "dA=" );
magma_cprint_gpu( M, N, dA, ldda );
TESTING_FREE_CPU( hA );
TESTING_FREE_DEV( dA );
}
}
TESTING_FINALIZE();
return status;
}
示例3: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing cposv_gpu
*/
int main( int argc, char** argv)
{
TESTING_INIT();
real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time;
float error, Rnorm, Anorm, Xnorm, *work;
magmaFloatComplex c_one = MAGMA_C_ONE;
magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
magmaFloatComplex *h_A, *h_B, *h_X;
magmaFloatComplex_ptr d_A, d_B;
magma_int_t N, lda, ldb, ldda, lddb, info, sizeA, sizeB;
magma_int_t ione = 1;
magma_int_t ISEED[4] = {0,0,0,1};
magma_int_t status = 0;
magma_opts opts;
parse_opts( argc, argv, &opts );
float tol = opts.tolerance * lapackf77_slamch("E");
printf("uplo = %s\n", lapack_uplo_const(opts.uplo) );
printf(" N NRHS CPU Gflop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n");
printf("================================================================================\n");
for( int itest = 0; itest < opts.ntest; ++itest ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
N = opts.nsize[itest];
lda = ldb = N;
ldda = ((N+31)/32)*32;
lddb = ldda;
gflops = ( FLOPS_CPOTRF( N ) + FLOPS_CPOTRS( N, opts.nrhs ) ) / 1e9;
TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N );
TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*opts.nrhs );
TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*opts.nrhs );
TESTING_MALLOC_CPU( work, float, N );
TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N );
TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*opts.nrhs );
/* ====================================================================
Initialize the matrix
=================================================================== */
sizeA = lda*N;
sizeB = ldb*opts.nrhs;
lapackf77_clarnv( &ione, ISEED, &sizeA, h_A );
lapackf77_clarnv( &ione, ISEED, &sizeB, h_B );
magma_cmake_hpd( N, h_A, lda );
magma_csetmatrix( N, N, h_A, N, d_A, ldda );
magma_csetmatrix( N, opts.nrhs, h_B, N, d_B, lddb );
/* ====================================================================
Performs operation using MAGMA
=================================================================== */
gpu_time = magma_wtime();
magma_cposv_gpu( opts.uplo, N, opts.nrhs, d_A, ldda, d_B, lddb, &info );
gpu_time = magma_wtime() - gpu_time;
gpu_perf = gflops / gpu_time;
if (info != 0)
printf("magma_cpotrf_gpu returned error %d: %s.\n",
(int) info, magma_strerror( info ));
/* =====================================================================
Residual
=================================================================== */
magma_cgetmatrix( N, opts.nrhs, d_B, lddb, h_X, ldb );
Anorm = lapackf77_clange("I", &N, &N, h_A, &lda, work);
Xnorm = lapackf77_clange("I", &N, &opts.nrhs, h_X, &ldb, work);
blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &opts.nrhs, &N,
&c_one, h_A, &lda,
h_X, &ldb,
&c_neg_one, h_B, &ldb );
Rnorm = lapackf77_clange("I", &N, &opts.nrhs, h_B, &ldb, work);
error = Rnorm/(N*Anorm*Xnorm);
status += ! (error < tol);
/* ====================================================================
Performs operation using LAPACK
=================================================================== */
if ( opts.lapack ) {
cpu_time = magma_wtime();
lapackf77_cposv( lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, h_B, &ldb, &info );
cpu_time = magma_wtime() - cpu_time;
cpu_perf = gflops / cpu_time;
if (info != 0)
printf("lapackf77_cposv returned error %d: %s.\n",
(int) info, magma_strerror( info ));
printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n",
(int) N, (int) opts.nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time,
error, (error < tol ? "ok" : "failed"));
}
else {
printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n",
//.........这里部分代码省略.........
示例4: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing cgemv_batched
*/
int main( int argc, char** argv)
{
TESTING_INIT();
real_Double_t gflops, magma_perf, magma_time, cpu_perf, cpu_time;
float magma_error, magma_err, Ynorm, work[1];
magma_int_t M, N, Xm, Ym, lda, ldda;
magma_int_t sizeA, sizeX, sizeY;
magma_int_t incx = 1;
magma_int_t incy = 1;
magma_int_t ione = 1;
magma_int_t ISEED[4] = {0,0,0,1};
magma_int_t status = 0;
magma_int_t batchCount;
magmaFloatComplex *h_A, *h_X, *h_Y, *h_Ymagma;
magmaFloatComplex *d_A, *d_X, *d_Y;
magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 );
magmaFloatComplex beta = MAGMA_C_MAKE( -0.48, 0.38 );
magmaFloatComplex **A_array = NULL;
magmaFloatComplex **X_array = NULL;
magmaFloatComplex **Y_array = NULL;
magma_opts opts;
parse_opts( argc, argv, &opts );
batchCount = opts.batchcount;
opts.lapack |= opts.check;
//float tol = opts.tolerance * lapackf77_slamch("E");
printf("trans = %s\n", lapack_trans_const(opts.transA) );
printf("BatchCount M N MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error\n");
printf("===================================================================================================\n");
for( int itest = 0; itest < opts.ntest; ++itest ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
M = opts.msize[itest];
N = opts.nsize[itest];
lda = ((M+31)/32)*32;
gflops = FLOPS_CGEMV( M, N ) / 1e9 * batchCount;
if ( opts.transA == MagmaNoTrans ) {
Xm = N;
Ym = M;
} else {
Xm = M;
Ym = N;
}
sizeA = lda*N*batchCount;
sizeX = incx*Xm*batchCount;
sizeY = incy*Ym*batchCount;
ldda = ((lda+31)/32)*32;
TESTING_MALLOC_CPU( h_A, magmaFloatComplex, sizeA );
TESTING_MALLOC_CPU( h_X, magmaFloatComplex, sizeX );
TESTING_MALLOC_CPU( h_Y, magmaFloatComplex, sizeY );
TESTING_MALLOC_CPU( h_Ymagma, magmaFloatComplex, sizeY );
TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N*batchCount );
TESTING_MALLOC_DEV( d_X, magmaFloatComplex, sizeX );
TESTING_MALLOC_DEV( d_Y, magmaFloatComplex, sizeY );
magma_malloc((void**)&A_array, batchCount * sizeof(*A_array));
magma_malloc((void**)&X_array, batchCount * sizeof(*X_array));
magma_malloc((void**)&Y_array, batchCount * sizeof(*Y_array));
/* Initialize the matrices */
lapackf77_clarnv( &ione, ISEED, &sizeA, h_A );
lapackf77_clarnv( &ione, ISEED, &sizeX, h_X );
lapackf77_clarnv( &ione, ISEED, &sizeY, h_Y );
/* =====================================================================
Performs operation using MAGMABLAS
=================================================================== */
magma_csetmatrix( M, N*batchCount, h_A, lda, d_A, ldda );
magma_csetvector( Xm*batchCount, h_X, incx, d_X, incx );
magma_csetvector( Ym*batchCount, h_Y, incy, d_Y, incy );
cset_pointer(A_array, d_A, ldda, 0, 0, ldda*N, batchCount, magma_stream);
cset_pointer(X_array, d_X, 1, 0, 0, incx*Xm, batchCount, magma_stream);
cset_pointer(Y_array, d_Y, 1, 0, 0, incy*Ym, batchCount, magma_stream);
magma_time = magma_sync_wtime( NULL );
magmablas_cgemv_batched(opts.transA, M, N,
alpha, A_array, ldda,
X_array, incx,
beta, Y_array, incy, batchCount, magma_stream);
magma_time = magma_sync_wtime( NULL ) - magma_time;
magma_perf = gflops / magma_time;
magma_cgetvector( Ym*batchCount, d_Y, incy, h_Ymagma, incy );
//.........这里部分代码省略.........
示例5: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing cgetf2_gpu
*/
int main( int argc, char** argv)
{
TESTING_INIT();
real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
float error;
magmaFloatComplex *h_A, *h_R;
magmaFloatComplex *d_A;
magma_int_t *ipiv;
magma_int_t M, N, n2, lda, ldda, info, min_mn;
magma_int_t ione = 1;
magma_int_t ISEED[4] = {0,0,0,1};
magma_int_t status = 0;
magma_opts opts;
parse_opts( argc, argv, &opts );
float tol = opts.tolerance * lapackf77_slamch("E");
printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||PA-LU||/(||A||*N)\n");
printf("=========================================================================\n");
for( int itest = 0; itest < opts.ntest; ++itest ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
M = opts.msize[itest];
N = opts.nsize[itest];
min_mn = min(M, N);
lda = M;
n2 = lda*N;
ldda = ((M+31)/32)*32;
gflops = FLOPS_CGETRF( M, N ) / 1e9;
if ( N > 512 ) {
printf( "%5d %5d skipping because cgetf2 does not support N > 512\n", (int) M, (int) N );
continue;
}
TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn );
TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 );
TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 );
TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N );
/* Initialize the matrix */
lapackf77_clarnv( &ione, ISEED, &n2, h_A );
lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda );
magma_csetmatrix( M, N, h_R, lda, d_A, ldda );
/* =====================================================================
Performs operation using LAPACK
=================================================================== */
if ( opts.lapack ) {
cpu_time = magma_wtime();
lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info);
cpu_time = magma_wtime() - cpu_time;
cpu_perf = gflops / cpu_time;
if (info != 0)
printf("lapackf77_cgetrf returned error %d: %s.\n",
(int) info, magma_strerror( info ));
}
/* ====================================================================
Performs operation using MAGMA
=================================================================== */
gpu_time = magma_wtime();
magma_cgetf2_gpu( M, N, d_A, ldda, ipiv, &info);
gpu_time = magma_wtime() - gpu_time;
gpu_perf = gflops / gpu_time;
if (info != 0)
printf("magma_cgetf2_gpu returned error %d: %s.\n",
(int) info, magma_strerror( info ));
/* =====================================================================
Check the factorization
=================================================================== */
if ( opts.lapack ) {
printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)",
(int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. );
}
else {
printf("%5d %5d --- ( --- ) %7.2f (%7.2f)",
(int) M, (int) N, gpu_perf, gpu_time*1000. );
}
if ( opts.check ) {
magma_cgetmatrix( M, N, d_A, ldda, h_A, lda );
error = get_LU_error( M, N, h_R, lda, h_A, ipiv );
printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed") );
status += ! (error < tol);
}
else {
printf(" --- \n");
}
TESTING_FREE_CPU( ipiv );
TESTING_FREE_CPU( h_A );
TESTING_FREE_PIN( h_R );
TESTING_FREE_DEV( d_A );
fflush( stdout );
}
//.........这里部分代码省略.........
示例6: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing cgehrd
*/
int main( int argc, char** argv)
{
TESTING_INIT();
real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
magmaFloatComplex *h_A, *h_R, *h_Q, *h_work, *tau, *twork;
magmaFloatComplex_ptr dT;
#if defined(PRECISION_z) || defined(PRECISION_c)
float *rwork;
#endif
float eps, result[2];
magma_int_t N, n2, lda, nb, lwork, ltwork, info;
magma_int_t ione = 1;
magma_int_t ISEED[4] = {0,0,0,1};
magma_int_t status = 0;
eps = lapackf77_slamch( "E" );
magma_opts opts;
parse_opts( argc, argv, &opts );
float tol = opts.tolerance * lapackf77_slamch("E");
printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) |A-QHQ'|/N|A| |I-QQ'|/N\n");
printf("=========================================================================\n");
for( int itest = 0; itest < opts.ntest; ++itest ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
N = opts.nsize[itest];
lda = N;
n2 = lda*N;
nb = magma_get_cgehrd_nb(N);
/* We suppose the magma nb is bigger than lapack nb */
lwork = N*nb;
gflops = FLOPS_CGEHRD( N ) / 1e9;
TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 );
TESTING_MALLOC_CPU( tau, magmaFloatComplex, N );
TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 );
TESTING_MALLOC_PIN( h_work, magmaFloatComplex, lwork );
TESTING_MALLOC_DEV( dT, magmaFloatComplex, nb*N );
/* Initialize the matrices */
lapackf77_clarnv( &ione, ISEED, &n2, h_A );
lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
/* ====================================================================
Performs operation using MAGMA
=================================================================== */
gpu_time = magma_wtime();
magma_cgehrd( N, ione, N, h_R, lda, tau, h_work, lwork, dT, &info);
gpu_time = magma_wtime() - gpu_time;
gpu_perf = gflops / gpu_time;
if (info != 0)
printf("magma_cgehrd returned error %d: %s.\n",
(int) info, magma_strerror( info ));
/* =====================================================================
Check the factorization
=================================================================== */
if ( opts.check ) {
ltwork = 2*(N*N);
TESTING_MALLOC_PIN( h_Q, magmaFloatComplex, lda*N );
TESTING_MALLOC_CPU( twork, magmaFloatComplex, ltwork );
#if defined(PRECISION_z) || defined(PRECISION_c)
TESTING_MALLOC_CPU( rwork, float, N );
#endif
lapackf77_clacpy(MagmaUpperLowerStr, &N, &N, h_R, &lda, h_Q, &lda);
for( int j = 0; j < N-1; ++j )
for( int i = j+2; i < N; ++i )
h_R[i+j*lda] = MAGMA_C_ZERO;
magma_cunghr(N, ione, N, h_Q, lda, tau, dT, nb, &info);
if (info != 0) {
printf("magma_cunghr returned error %d: %s.\n",
(int) info, magma_strerror( info ));
exit(1);
}
#if defined(PRECISION_z) || defined(PRECISION_c)
lapackf77_chst01(&N, &ione, &N,
h_A, &lda, h_R, &lda,
h_Q, &lda, twork, <work, rwork, result);
#else
lapackf77_chst01(&N, &ione, &N,
h_A, &lda, h_R, &lda,
h_Q, &lda, twork, <work, result);
#endif
TESTING_FREE_PIN( h_Q );
TESTING_FREE_CPU( twork );
#if defined(PRECISION_z) || defined(PRECISION_c)
TESTING_FREE_CPU( rwork );
#endif
}
//.........这里部分代码省略.........
示例7: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing clarfb_gpu
*/
int main( int argc, char** argv )
{
TESTING_INIT();
magmaFloatComplex c_zero = MAGMA_C_ZERO;
magmaFloatComplex c_one = MAGMA_C_ONE;
magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
magma_int_t M, N, K, size, ldc, ldv, ldt, ldw, nv;
magma_int_t ione = 1;
magma_int_t ISEED[4] = {0,0,0,1};
float error, work[1];
magma_int_t status = 0;
// test all combinations of input parameters
magma_side_t side [] = { MagmaLeft, MagmaRight };
magma_trans_t trans [] = { MagmaConjTrans, MagmaNoTrans };
magma_direct_t direct[] = { MagmaForward, MagmaBackward };
magma_storev_t storev[] = { MagmaColumnwise, MagmaRowwise };
magma_opts opts;
parse_opts( argc, argv, &opts );
float tol = opts.tolerance * lapackf77_slamch("E");
printf(" M N K storev side direct trans ||R||_F / ||HC||_F\n");
printf("========================================================================\n");
for( int itest = 0; itest < opts.ntest; ++itest ) {
M = opts.msize[itest];
N = opts.nsize[itest];
K = opts.ksize[itest];
if ( M < K || N < K || K <= 0 ) {
printf( "%5d %5d %5d skipping because clarfb requires M >= K, N >= K, K >= 0\n",
(int) M, (int) N, (int) K );
continue;
}
for( int istor = 0; istor < 2; ++istor ) {
for( int iside = 0; iside < 2; ++iside ) {
for( int idir = 0; idir < 2; ++idir ) {
for( int itran = 0; itran < 2; ++itran ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
ldc = ((M+31)/32)*32;
ldt = ((K+31)/32)*32;
ldw = (side[iside] == MagmaLeft ? N : M);
// (ldv, nv) get swapped later if rowwise
ldv = (side[iside] == MagmaLeft ? M : N);
nv = K;
// Allocate memory for matrices
magmaFloatComplex *C, *R, *V, *T, *W;
TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*N );
TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*N );
TESTING_MALLOC_CPU( V, magmaFloatComplex, ldv*K );
TESTING_MALLOC_CPU( T, magmaFloatComplex, ldt*K );
TESTING_MALLOC_CPU( W, magmaFloatComplex, ldw*K );
magmaFloatComplex_ptr dC, dV, dT, dW;
TESTING_MALLOC_DEV( dC, magmaFloatComplex, ldc*N );
TESTING_MALLOC_DEV( dV, magmaFloatComplex, ldv*K );
TESTING_MALLOC_DEV( dT, magmaFloatComplex, ldt*K );
TESTING_MALLOC_DEV( dW, magmaFloatComplex, ldw*K );
// C is M x N.
size = ldc*N;
lapackf77_clarnv( &ione, ISEED, &size, C );
//printf( "C=" ); magma_cprint( M, N, C, ldc );
// V is ldv x nv. See larfb docs for description.
// if column-wise and left, M x K
// if column-wise and right, N x K
// if row-wise and left, K x M
// if row-wise and right, K x N
size = ldv*nv;
lapackf77_clarnv( &ione, ISEED, &size, V );
if ( storev[istor] == MagmaColumnwise ) {
if ( direct[idir] == MagmaForward ) {
lapackf77_claset( MagmaUpperStr, &K, &K, &c_zero, &c_one, V, &ldv );
}
else {
lapackf77_claset( MagmaLowerStr, &K, &K, &c_zero, &c_one, &V[(ldv-K)], &ldv );
}
}
else {
// rowwise, swap V's dimensions
std::swap( ldv, nv );
if ( direct[idir] == MagmaForward ) {
lapackf77_claset( MagmaLowerStr, &K, &K, &c_zero, &c_one, V, &ldv );
}
else {
lapackf77_claset( MagmaUpperStr, &K, &K, &c_zero, &c_one, &V[(nv-K)*ldv], &ldv );
}
}
//printf( "# ldv %d, nv %d\n", ldv, nv );
//printf( "V=" ); magma_cprint( ldv, nv, V, ldv );
// T is K x K, upper triangular for forward, and lower triangular for backward
magma_int_t k1 = K-1;
size = ldt*K;
//.........这里部分代码省略.........
示例8: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing zhesv
*/
int main( int argc, char** argv)
{
TESTING_INIT();
magmaDoubleComplex *h_A, *h_B, *h_X, *work, temp;
real_Double_t gflops, gpu_perf, gpu_time = 0.0, cpu_perf=0, cpu_time=0;
double error, error_lapack = 0.0;
magma_int_t *ipiv;
magma_int_t N, n2, lda, ldb, sizeB, lwork, info;
magma_int_t status = 0, ione = 1;
magma_int_t ISEED[4] = {0,0,0,1};
magma_opts opts;
opts.parse_opts( argc, argv );
double tol = opts.tolerance * lapackf77_dlamch("E");
printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |Ax-b|/(N*|A|*|x|)\n");
printf("%%========================================================================\n");
for( int itest = 0; itest < opts.ntest; ++itest ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
N = opts.nsize[itest];
ldb = N;
lda = N;
n2 = lda*N;
sizeB = ldb*opts.nrhs;
gflops = ( FLOPS_ZPOTRF( N ) + FLOPS_ZPOTRS( N, opts.nrhs ) ) / 1e9;
TESTING_MALLOC_CPU( ipiv, magma_int_t, N );
TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, n2 );
TESTING_MALLOC_PIN( h_B, magmaDoubleComplex, sizeB );
TESTING_MALLOC_PIN( h_X, magmaDoubleComplex, sizeB );
/* =====================================================================
Performs operation using LAPACK
=================================================================== */
if ( opts.lapack ) {
lwork = -1;
lapackf77_zhesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs,
h_A, &lda, ipiv, h_X, &ldb, &temp, &lwork, &info);
lwork = (int)MAGMA_Z_REAL(temp);
TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork );
init_matrix( N, N, h_A, lda );
lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B );
lapackf77_zlacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb );
cpu_time = magma_wtime();
lapackf77_zhesv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs,
h_A, &lda, ipiv, h_X, &ldb, work, &lwork, &info);
cpu_time = magma_wtime() - cpu_time;
cpu_perf = gflops / cpu_time;
if (info != 0) {
printf("lapackf77_zhesv returned error %d: %s.\n",
(int) info, magma_strerror( info ));
}
error_lapack = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb );
TESTING_FREE_CPU( work );
}
/* ====================================================================
Performs operation using MAGMA
=================================================================== */
init_matrix( N, N, h_A, lda );
lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B );
lapackf77_zlacpy( MagmaFullStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb );
magma_setdevice(0);
gpu_time = magma_wtime();
magma_zhesv( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, &info);
gpu_time = magma_wtime() - gpu_time;
gpu_perf = gflops / gpu_time;
if (info != 0) {
printf("magma_zhesv returned error %d: %s.\n",
(int) info, magma_strerror( info ));
}
/* =====================================================================
Check the factorization
=================================================================== */
if ( opts.lapack ) {
printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)",
(int) N, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
}
else {
printf("%5d %5d --- ( --- ) %7.2f (%7.2f)",
(int) N, (int) N, gpu_perf, gpu_time );
}
if ( opts.check == 0 ) {
printf(" --- \n");
} else {
error = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb );
printf(" %8.2e %s", error, (error < tol ? "ok" : "failed"));
if (opts.lapack)
printf(" (lapack rel.res. = %8.2e)", error_lapack);
printf("\n");
//.........这里部分代码省略.........
示例9: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing zposv_batched
*/
int main(int argc, char **argv)
{
TESTING_INIT();
real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time;
double err = 0.0, Rnorm, Anorm, Xnorm, *work;
magmaDoubleComplex c_one = MAGMA_Z_ONE;
magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
magmaDoubleComplex *h_A, *h_B, *h_X;
magmaDoubleComplex_ptr d_A, d_B;
magma_int_t *dinfo_array;
magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB;
magma_int_t ione = 1;
magma_int_t ISEED[4] = {0,0,0,1};
magma_int_t status = 0;
magma_int_t batchCount = 1;
magmaDoubleComplex **dA_array = NULL;
magmaDoubleComplex **dB_array = NULL;
magma_queue_t queue = magma_stream;
magma_opts opts;
parse_opts( argc, argv, &opts );
double tol = opts.tolerance * lapackf77_dlamch("E");
nrhs = opts.nrhs;
batchCount = opts.batchcount ;
printf("uplo = %s\n", lapack_uplo_const(opts.uplo) );
printf("BatchCount N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n");
printf("================================================================================\n");
for( int itest = 0; itest < opts.ntest; ++itest ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
N = opts.nsize[itest];
lda = N;
ldb = lda;
ldda = ((N+31)/32)*32;
lddb = ldda;
gflops = ( FLOPS_ZPOTRF( N) + FLOPS_ZPOTRS( N, nrhs ) ) / 1e9 * batchCount;
sizeA = lda*N*batchCount;
sizeB = ldb*nrhs*batchCount;
TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, sizeA );
TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, sizeB );
TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, sizeB );
TESTING_MALLOC_CPU( work, double, N);
TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N*batchCount );
TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs*batchCount );
TESTING_MALLOC_DEV( dinfo_array, magma_int_t, batchCount );
magma_malloc((void**)&dA_array, batchCount * sizeof(*dA_array));
magma_malloc((void**)&dB_array, batchCount * sizeof(*dB_array));
/* Initialize the matrices */
lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A );
lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B );
for(int i=0; i<batchCount; i++)
{
magma_zmake_hpd( N, h_A + i * lda * N, lda );// need modification
}
magma_zsetmatrix( N, N*batchCount, h_A, lda, d_A, ldda );
magma_zsetmatrix( N, nrhs*batchCount, h_B, ldb, d_B, lddb );
/* ====================================================================
Performs operation using MAGMA
=================================================================== */
zset_pointer(dA_array, d_A, ldda, 0, 0, ldda*N, batchCount, queue);
zset_pointer(dB_array, d_B, lddb, 0, 0, lddb*nrhs, batchCount, queue);
gpu_time = magma_wtime();
info = magma_zposv_batched(opts.uplo, N, nrhs, dA_array, ldda, dB_array, lddb, dinfo_array, batchCount, queue);
gpu_time = magma_wtime() - gpu_time;
gpu_perf = gflops / gpu_time;
// check correctness of results throught "dinfo_magma" and correctness of argument throught "info"
magma_int_t *cpu_info = (magma_int_t*) malloc(batchCount*sizeof(magma_int_t));
magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1);
for(int i=0; i<batchCount; i++)
{
if(cpu_info[i] != 0 ){
printf("magma_zposv_batched matrix %d returned internal error %d\n",i, (int)cpu_info[i] );
}
}
if (info != 0)
printf("magma_zposv_batched returned argument error %d: %s.\n", (int) info, magma_strerror( info ));
//=====================================================================
// Residual
//=====================================================================
magma_zgetmatrix( N, nrhs*batchCount, d_B, lddb, h_X, ldb );
//.........这里部分代码省略.........
示例10: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing chegvdx
*/
int main( int argc, char** argv)
{
TESTING_INIT();
/* Constants */
const magmaFloatComplex c_zero = MAGMA_C_ZERO;
const magmaFloatComplex c_one = MAGMA_C_ONE;
const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
const magma_int_t ione = 1;
/* Local variables */
real_Double_t gpu_time;
magmaFloatComplex *h_A, *h_R, *h_B, *h_S, *h_work;
#ifdef COMPLEX
float *rwork;
magma_int_t lrwork;
#endif
float *w1, *w2, result[2]={0,0};
magma_int_t *iwork;
magma_int_t N, n2, info, lda, lwork, liwork;
magma_int_t ISEED[4] = {0,0,0,1};
magma_int_t status = 0;
magma_opts opts;
opts.parse_opts( argc, argv );
float tol = opts.tolerance * lapackf77_slamch("E");
float tolulp = opts.tolerance * lapackf77_slamch("P");
magma_range_t range = MagmaRangeAll;
if (opts.fraction != 1)
range = MagmaRangeI;
// pass ngpu = -1 to test multi-GPU code using 1 gpu
magma_int_t abs_ngpu = abs( opts.ngpu );
printf("%% itype = %d, jobz = %s, range = %s, uplo = %s, fraction = %6.4f, ngpu = %d\n",
int(opts.itype), lapack_vec_const(opts.jobz), lapack_range_const(range), lapack_uplo_const(opts.uplo),
opts.fraction, int(abs_ngpu) );
if (opts.itype == 1) {
printf("%% N M GPU Time (sec) |AZ-BZD| |D - D_magma|\n");
}
else if (opts.itype == 2) {
printf("%% N M GPU Time (sec) |ABZ-ZD| |D - D_magma|\n");
}
else if (opts.itype == 3) {
printf("%% N M GPU Time (sec) |BAZ-ZD| |D - D_magma|\n");
}
printf("%%======================================================\n");
magma_int_t threads = magma_get_parallel_numthreads();
for( int itest = 0; itest < opts.ntest; ++itest ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
N = opts.nsize[itest];
lda = N;
n2 = lda*N;
// TODO: test vl-vu range
magma_int_t m1 = 0;
float vl = 0;
float vu = 0;
magma_int_t il = 0;
magma_int_t iu = 0;
if (opts.fraction == 0) {
il = max( 1, magma_int_t(0.1*N) );
iu = max( 1, magma_int_t(0.3*N) );
}
else {
il = 1;
iu = max( 1, magma_int_t(opts.fraction*N) );
}
magma_cheevdx_getworksize(N, threads, (opts.jobz == MagmaVec),
&lwork,
#ifdef COMPLEX
&lrwork,
#endif
&liwork);
/* Allocate host memory for the matrix */
TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 );
TESTING_MALLOC_CPU( h_B, magmaFloatComplex, n2 );
TESTING_MALLOC_CPU( w1, float, N );
TESTING_MALLOC_CPU( w2, float, N );
TESTING_MALLOC_CPU( iwork, magma_int_t, liwork );
TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 );
TESTING_MALLOC_PIN( h_S, magmaFloatComplex, n2 );
TESTING_MALLOC_PIN( h_work, magmaFloatComplex, max( lwork, N*N )); // check needs N*N
#ifdef COMPLEX
TESTING_MALLOC_PIN( rwork, float, lrwork);
#endif
/* Initialize the matrix */
lapackf77_clarnv( &ione, ISEED, &n2, h_A );
//.........这里部分代码省略.........
示例11: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing zcgeqrsv
*/
int main( int argc, char** argv)
{
TESTING_INIT();
real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time, gpu_perfd, gpu_perfs;
double error, gpu_error, cpu_error, Anorm, work[1];
magmaDoubleComplex c_one = MAGMA_Z_ONE;
magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
magmaDoubleComplex *h_A, *h_A2, *h_B, *h_X, *h_R;
magmaDoubleComplex_ptr d_A, d_B, d_X, d_T;
magmaFloatComplex *d_SA, *d_SB;
magmaDoubleComplex *h_workd, *tau, tmp[1];
magmaFloatComplex *h_works;
magma_int_t lda, ldb, lhwork, lworkgpu;
magma_int_t ldda, lddb, lddx;
magma_int_t M, N, nrhs, qrsv_iters, info, size, min_mn, max_mn, nb;
magma_int_t ione = 1;
magma_int_t ISEED[4] = {0,0,0,1};
printf("Epsilon(double): %8.6e\n"
"Epsilon(single): %8.6e\n\n",
lapackf77_dlamch("Epsilon"), lapackf77_slamch("Epsilon") );
magma_int_t status = 0;
magma_opts opts;
parse_opts( argc, argv, &opts );
double tol = opts.tolerance * lapackf77_dlamch("E");
nrhs = opts.nrhs;
printf(" CPU Gflop/s GPU Gflop/s |b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n");
printf(" M N NRHS double double single mixed Iter CPU GPU \n");
printf("=============================================================================================================\n");
for( int itest = 0; itest < opts.ntest; ++itest ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
M = opts.msize[itest];
N = opts.nsize[itest];
if ( M < N ) {
printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs );
continue;
}
min_mn = min(M, N);
max_mn = max(M, N);
lda = M;
ldb = max_mn;
ldda = ((M+31)/32) * 32;
lddb = ((max_mn+31)/32)*32;
lddx = ((N+31)/32) * 32;
nb = max( magma_get_zgeqrf_nb( M ), magma_get_cgeqrf_nb( M ) );
gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRS( M, N, nrhs )) / 1e9;
lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb;
// query for workspace size
lhwork = -1;
lapackf77_zgels( MagmaNoTransStr, &M, &N, &nrhs,
NULL, &lda, NULL, &ldb, tmp, &lhwork, &info );
lhwork = (magma_int_t) MAGMA_Z_REAL( tmp[0] );
lhwork = max( lhwork, lworkgpu );
TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn );
TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N );
TESTING_MALLOC_CPU( h_A2, magmaDoubleComplex, lda*N );
TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, ldb*nrhs );
TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, ldb*nrhs );
TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, ldb*nrhs );
TESTING_MALLOC_CPU( h_workd, magmaDoubleComplex, lhwork );
h_works = (magmaFloatComplex*)h_workd;
TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N );
TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs );
TESTING_MALLOC_DEV( d_X, magmaDoubleComplex, lddx*nrhs );
TESTING_MALLOC_DEV( d_T, magmaDoubleComplex, ( 2*min_mn + (N+31)/32*32 )*nb );
/* Initialize the matrices */
size = lda*N;
lapackf77_zlarnv( &ione, ISEED, &size, h_A );
lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda );
// make random RHS
size = ldb*nrhs;
lapackf77_zlarnv( &ione, ISEED, &size, h_B );
lapackf77_zlacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb );
magma_zsetmatrix( M, N, h_A, lda, d_A, ldda );
magma_zsetmatrix( M, nrhs, h_B, ldb, d_B, lddb );
//=====================================================================
// Mixed Precision Iterative Refinement - GPU
//=====================================================================
gpu_time = magma_wtime();
magma_zcgeqrsv_gpu( M, N, nrhs,
d_A, ldda, d_B, lddb,
d_X, lddx, &qrsv_iters, &info );
gpu_time = magma_wtime() - gpu_time;
gpu_perf = gflops / gpu_time;
//.........这里部分代码省略.........
示例12: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing cunmbr
*/
// Test driver labelled "Testing cunmbr" by the banner above.
// The part visible in this listing parses the command-line options, then for
// every test size and every vect/side/trans combination computes the flop
// count, allocates the host buffers and fills C with random data (copied to R).
// The remainder of the loop body is elided in this listing.
int main( int argc, char** argv )
{
TESTING_INIT();
real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
float error, dwork[1];
magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
magma_int_t ione = 1;
magma_int_t m, n, k, mi, ni, mm, nn, nq, size, info;
magma_int_t ISEED[4] = {0,0,0,1};
magma_int_t nb, ldc, lda, lwork, lwork_max;
magmaFloatComplex *C, *R, *A, *work, *tau, *tauq, *taup;
float *d, *e;
magma_int_t status = 0;
magma_opts opts;
parse_opts( argc, argv, &opts );
// need slightly looser bound (60*eps instead of 30*eps) for some tests
// (the tolerance factor is raised to at least 60; a larger user value is kept)
opts.tolerance = max( 60., opts.tolerance );
// final threshold = tolerance factor times the value returned by slamch("E")
float tol = opts.tolerance * lapackf77_slamch("E");
// test all combinations of input parameters
magma_vect_t vect [] = { MagmaQ, MagmaP };
magma_side_t side [] = { MagmaLeft, MagmaRight };
magma_trans_t trans[] = { Magma_ConjTrans, MagmaNoTrans };
printf(" M N K vect side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n");
printf("===============================================================================================\n");
// Loop nest: test size -> vect (Q/P) -> side (left/right) -> trans -> repeat iteration.
for( int itest = 0; itest < opts.ntest; ++itest ) {
for( int ivect = 0; ivect < 2; ++ivect ) {
for( int iside = 0; iside < 2; ++iside ) {
for( int itran = 0; itran < 2; ++itran ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
m = opts.msize[itest];
n = opts.nsize[itest];
k = opts.ksize[itest];
// block size is taken from the cgebrd tuning query for m
nb = magma_get_cgebrd_nb( m );
// C (and its copy R) are stored with leading dimension m
ldc = m;
// A is nq x k (vect=Q) or k x nq (vect=P)
// where nq=m (left) or nq=n (right)
nq = (side[iside] == MagmaLeft ? m : n );
mm = (vect[ivect] == MagmaQ ? nq : k );
nn = (vect[ivect] == MagmaQ ? k : nq);
lda = mm;
// MBR calls either MQR or MLQ in various ways
// The flop count below mirrors that choice: the full m x n x k count is used
// when nq >= k (vect=Q) or nq > k (vect=P); otherwise the count is taken for a
// problem with one fewer row (left) or column (right) and nq-1 as the third size.
if ( vect[ivect] == MagmaQ ) {
if ( nq >= k ) {
gflops = FLOPS_CUNMQR( m, n, k, side[iside] ) / 1e9;
}
else {
if ( side[iside] == MagmaLeft ) {
mi = m - 1;
ni = n;
}
else {
mi = m;
ni = n - 1;
}
gflops = FLOPS_CUNMQR( mi, ni, nq-1, side[iside] ) / 1e9;
}
}
else {
if ( nq > k ) {
gflops = FLOPS_CUNMLQ( m, n, k, side[iside] ) / 1e9;
}
else {
if ( side[iside] == MagmaLeft ) {
mi = m - 1;
ni = n;
}
else {
mi = m;
ni = n - 1;
}
gflops = FLOPS_CUNMLQ( mi, ni, nq-1, side[iside] ) / 1e9;
}
}
// workspace for gebrd is (mm + nn)*nb
// workspace for unmbr is m*nb or n*nb, depending on side
// a single buffer of the largest of these sizes is allocated as `work`
lwork_max = max( (mm + nn)*nb, max( m*nb, n*nb ));
// Host buffers: C and R are ldc x n, A is lda x nn; d, e, tauq and taup each
// hold min(mm,nn) entries.
TESTING_MALLOC_CPU( C, magmaFloatComplex, ldc*n );
TESTING_MALLOC_CPU( R, magmaFloatComplex, ldc*n );
TESTING_MALLOC_CPU( A, magmaFloatComplex, lda*nn );
TESTING_MALLOC_CPU( work, magmaFloatComplex, lwork_max );
TESTING_MALLOC_CPU( d, float, min(mm,nn) );
TESTING_MALLOC_CPU( e, float, min(mm,nn) );
TESTING_MALLOC_CPU( tauq, magmaFloatComplex, min(mm,nn) );
TESTING_MALLOC_CPU( taup, magmaFloatComplex, min(mm,nn) );
// C is full, m x n
// fill C with random values, then keep an identical copy in R
size = ldc*n;
lapackf77_clarnv( &ione, ISEED, &size, C );
lapackf77_clacpy( "Full", &m, &n, C, &ldc, R, &ldc );
//.........这里部分代码省略.........
示例13: main
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgeqlf

   For every (M, N) size listed in opts, and for opts.niter repetitions of each,
   this driver:
     1. fills an M x N matrix with random values and copies it to a pinned buffer,
     2. runs magma_zgeqlf on the pinned copy and times it,
     3. runs lapackf77_zgeqlf on the original and times it,
     4. prints both timings and the relative Frobenius-norm difference between
        the two results, marking the run "ok" when it is below tol.

   Returns the number of runs whose relative difference was not below tol.
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    double error, work[1];
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1];
    // note: the previously declared `ldda` was assigned but never read; it has been removed
    magma_int_t M, N, n2, lda, lwork, info, min_mn, nb;
    magma_int_t ione = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t status = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    // acceptance threshold: twice the user tolerance factor times dlamch("E")
    double tol = 2. * opts.tolerance * lapackf77_dlamch("E");

    printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n");
    printf("=======================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            lda = M;
            n2 = lda*N;
            nb = magma_get_zgeqrf_nb(M);
            gflops = FLOPS_ZGEQLF( M, N ) / 1e9;

            // query for workspace size (lwork = -1 asks LAPACK to report it in tmp[0])
            lwork = -1;
            lapackf77_zgeqlf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info);
            lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] );
            // one workspace is shared by both calls, so take the largest requirement
            lwork = max( lwork, N*nb );
            lwork = max( lwork, 2*nb*nb);

            TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn );
            TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 );
            TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lwork );
            TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 );

            /* Initialize the matrix: random h_A, identical copy in h_R */
            lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
            lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda );

            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            gpu_time = magma_wtime();
            magma_zgeqlf( M, N, h_R, lda, tau, h_work, lwork, &info);
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0) {
                printf("magma_zgeqlf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }

            /* =====================================================================
               Performs operation using LAPACK
               (reuses tau and h_work; h_A is overwritten with the LAPACK result)
               =================================================================== */
            cpu_time = magma_wtime();
            lapackf77_zgeqlf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info);
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            if (info != 0) {
                printf("lapack_zgeqlf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            }

            /* =====================================================================
               Check the result compared to LAPACK:
               error = ||h_R - h_A||_F / ||h_A||_F, with h_A holding the LAPACK output
               =================================================================== */
            error = lapackf77_zlange("f", &M, &N, h_A, &lda, work);
            blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione);
            error = lapackf77_zlange("f", &M, &N, h_R, &lda, work) / error;

            printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n",
                   (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time,
                   error, (error < tol ? "ok" : "failed"));
            // written as !(error < tol) so that a NaN error also counts as a failure
            status += ! (error < tol);

            TESTING_FREE_CPU( tau );
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_work );
            TESTING_FREE_PIN( h_R );
            fflush( stdout );
        }
        // blank line between size groups when each size is repeated
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
示例14: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing zgetrf
   NOTE(review): the banner says zgetrf, but the visible code queries
   lapackf77_zgetri, uses magma_get_zgetri_nb and FLOPS_ZGETRI, and prints
   "testing_zgetri_gpu" in its usage text -- presumably this driver tests
   zgetri; confirm and correct the banner.

   The part visible in this listing parses an optional "-N <size>" argument,
   queries workspace sizes, creates a queue on the first device, allocates
   buffers, and for each test size builds a random matrix and LU-factors it on
   the device. The remainder of the loop body is elided in this listing.
*/
int main( int argc, char** argv)
{
real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error;
magmaDoubleComplex *h_A, *h_R;
magmaDoubleComplex_ptr d_A, dwork;
magma_int_t N = 0, n2, lda, ldda;
// default list of matrix sizes to run
magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 5600, 5600, 5600, 5600, 5600 };
magma_int_t ntest = 10;
magma_int_t i, info;
magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
magma_int_t ione = 1;
magma_int_t ISEED[4] = {0, 0, 0, 1};
magmaDoubleComplex *work;
magmaDoubleComplex tmp;
double rwork[1];
magma_int_t *ipiv;
magma_int_t lwork, ldwork;
double A_norm, R_norm;
// Command line: look for "-N <value>". If arguments were given but no positive
// N results, the program exits with status 1. With no arguments the usage text
// is printed and execution continues with the default sizes.
// NOTE(review): argv[++i] is read without checking that another argument
// exists; "-N" as the last argument would hand a null pointer to atoi.
if (argc != 1){
for(i = 1; i<argc; i++){
if (strcmp("-N", argv[i])==0)
N = atoi(argv[++i]);
}
// the user-supplied N replaces the first and the last entry of size[]
if (N>0) size[0] = size[ntest-1] = N;
else exit(1);
}
else {
printf("\nUsage: \n");
printf(" testing_zgetri_gpu -N %d\n\n", 1024);
}
/* query for Lapack workspace size */
// lwork = -1 requests a workspace-size query; the answer is read back from *work.
// NOTE(review): h_A and ipiv have not been assigned yet at this point (they are
// allocated further down); this relies on the query call not touching them.
N = size[ntest-1];
lda = N;
work = &tmp;
lwork = -1;
lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info );
if (info != 0)
printf("lapackf77_zgetri returned error %d\n", (int) info);
lwork = int( MAGMA_Z_REAL( *work ));
/* query for Magma workspace size */
ldwork = N * magma_get_zgetri_nb( N );
/* Initialize */
// initialise MAGMA, enumerate devices and create a queue on device[0];
// any failure terminates the program with exit(-1)
magma_queue_t queue;
magma_device_t device[ MagmaMaxGPUs ];
int num = 0;
magma_err_t err;
magma_init();
err = magma_get_devices( device, MagmaMaxGPUs, &num );
if ( err != 0 || num < 1 ) {
fprintf( stderr, "magma_get_devices failed: %d\n", err );
exit(-1);
}
err = magma_queue_create( device[0], &queue );
if ( err != 0 ) {
fprintf( stderr, "magma_queue_create failed: %d\n", err );
exit(-1);
}
/* Allocate memory */
// All buffers are sized once, for N = size[ntest-1]; `work` now points to a
// heap buffer instead of &tmp.
// NOTE(review): when -N is smaller than other entries of size[], these buffers
// are too small for those entries -- this is only safe if the elided part of
// the loop stops after the first iteration in that case; confirm.
n2 = N * N;
ldda = ((N+31)/32) * 32;
TESTING_MALLOC_CPU( ipiv, magma_int_t, N );
TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork );
TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 );
TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 );
TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N );
TESTING_MALLOC_DEV( dwork, magmaDoubleComplex, ldwork );
printf(" N CPU GFlop/s GPU GFlop/s ||R||_F / ||A||_F\n");
printf("========================================================\n");
for( i=0; i < ntest; i++ ){
N = size[i];
lda = N;
n2 = lda*N;
gflops = FLOPS_ZGETRI( (double)N ) / 1e9;
// device leading dimension: N rounded up to a multiple of 32
ldda = ((N+31)/32)*32;
/* Initialize the matrix */
// random N x N matrix; its Frobenius norm is kept in A_norm
lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
A_norm = lapackf77_zlange( "f", &N, &N, h_A, &lda, rwork );
/* Factor the matrix. Both MAGMA and LAPACK will use this factor. */
// upload h_A, factor it on the device, and copy the factored matrix back into h_A
magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue );
magma_zgetrf_gpu( N, N, d_A, 0, ldda, ipiv, &info, queue );
magma_zgetmatrix( N, N, d_A, 0, ldda, h_A, 0, lda, queue );
// check for exact singularity
//h_A[ 10 + 10*lda ] = MAGMA_Z_MAKE( 0.0, 0.0 );
//magma_zsetmatrix( N, N, h_A, lda, d_A, ldda );
//.........这里部分代码省略.........
示例15: main
/* ////////////////////////////////////////////////////////////////////////////
-- Testing magma_zhemm_mgpu
*/
int main( int argc, char** argv)
{
TESTING_INIT();
magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
magmaDoubleComplex calpha = MAGMA_Z_MAKE( 3.456, 5.678 );
magmaDoubleComplex cbeta = MAGMA_Z_MAKE( 1.234, 2.456 );
real_Double_t gflops, gpu_perf=0., cpu_perf=0., gpu_time=0., cpu_time=0.;
real_Double_t gpu_perf2=0., gpu_time2=0.;
double error=0., errorbis=0., work[1];
magmaDoubleComplex *hA, *hX, *hB, *hR;
magmaDoubleComplex_ptr dA[MagmaMaxGPUs], dX[MagmaMaxGPUs], dB[MagmaMaxGPUs], dwork[MagmaMaxGPUs], hwork[MagmaMaxGPUs+1];
magmaDoubleComplex_ptr dA2;
magma_int_t M, N, size, lda, ldda, msize, nb, nstream;
magma_int_t ione = 1;
magma_int_t iseed[4] = {0,0,0,1};
magma_int_t status = 0;
magma_opts opts;
parse_opts( argc, argv, &opts );
double tol = opts.tolerance * lapackf77_dlamch("E");
// default values
nb = (opts.nb > 0 ? opts.nb : 64);
nstream = (opts.nstream > 0 ? opts.nstream : 2);
magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2];
magma_int_t nbcmplx = 0;
magma_buildconnection_mgpu(gnode, &nbcmplx, opts.ngpu);
printf("Initializing communication pattern... GPU-ncmplx %d\n\n", (int) nbcmplx);
for (int i=0; i < nbcmplx; ++i) {
int myngpu = gnode[i][MagmaMaxGPUs];
printf("cmplx %d has %d gpu ", i, myngpu);
for(int j=0; j < myngpu; ++j)
printf(" %d", (int) gnode[i][j]);
printf("\n");
}
magma_int_t nbevents = 2;
magma_queue_t streams[MagmaMaxGPUs][20];
magma_event_t redevents[MagmaMaxGPUs][20];
magma_event_t redevents2[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10];
for( int d = 0; d < opts.ngpu; ++d ) {
for( magma_int_t i = 0; i < nstream; ++i ) {
magma_queue_create( &streams[d][i] );
}
for( magma_int_t i = 0; i < nbevents; ++i ) {
cudaEventCreateWithFlags(&redevents[d][i], cudaEventDisableTiming);
cudaEventCreateWithFlags(&redevents2[d][i], cudaEventDisableTiming);
}
}
printf( "nb %d, ngpu %d, nstream %d version %d\n", (int) nb, (int) opts.ngpu, (int) nstream, (int) opts.version );
printf(" M N nb offset CPU GFlop/s (sec) GPU GFlop/s (sec) CUBLAS hemm (sec) ||R|| / ||A||*||X||\n");
printf("=========================================================================================================\n");
for( int itest = 0; itest < opts.ntest; ++itest ) {
M = opts.msize[itest];
N = opts.nsize[itest];
for( int offset = 0; offset < N; offset += min(N,nb) ) {
for( int iter = 0; iter < opts.niter; ++iter ) {
msize = M - offset;
lda = M;
ldda = ((M + 31)/32)*32;
size = lda*M;
gflops = FLOPS_ZHEMM( MagmaLeft, (double)msize, (double)N ) / 1e9;
magma_int_t dworksiz = ldda*N*3;
magma_int_t hworksiz = lda*N;
TESTING_MALLOC_CPU( hA, magmaDoubleComplex, lda*M );
TESTING_MALLOC_CPU( hX, magmaDoubleComplex, lda*N );
TESTING_MALLOC_CPU( hB, magmaDoubleComplex, lda*N );
TESTING_MALLOC_PIN( hR, magmaDoubleComplex, lda*N );
for( int d = 0; d < opts.ngpu; ++d ) {
magma_int_t mlocal = ((M / nb) / opts.ngpu + 1) * nb;
magma_setdevice( d );
TESTING_MALLOC_DEV( dA[d], magmaDoubleComplex, ldda*mlocal );
TESTING_MALLOC_DEV( dX[d], magmaDoubleComplex, ldda*N );
TESTING_MALLOC_DEV( dB[d], magmaDoubleComplex, ldda*N );
TESTING_MALLOC_DEV( dwork[d], magmaDoubleComplex, dworksiz );
TESTING_MALLOC_PIN( hwork[d], magmaDoubleComplex, hworksiz );
}
TESTING_MALLOC_PIN( hwork[opts.ngpu], magmaDoubleComplex, lda*N );
if ( opts.check ) {
magma_setdevice( 0 );
TESTING_MALLOC_DEV( dA2, magmaDoubleComplex, ldda*M );
}
lapackf77_zlarnv( &ione, iseed, &size, hA );
magma_zmake_hermitian( M, hA, lda );
//.........这里部分代码省略.........