本文整理匯總了C++中FLA_Repart_2x1_to_3x1函數的典型用法代碼示例。如果您正苦於以下問題：C++ FLA_Repart_2x1_to_3x1函數的具體用法？C++ FLA_Repart_2x1_to_3x1怎麼用？C++ FLA_Repart_2x1_to_3x1使用的例子？那麼，這裡精選的函數代碼示例或許可以為您提供幫助。
在下文中一共展示了FLA_Repart_2x1_to_3x1函數的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點贊，您的評價將有助於系統推薦出更棒的C++代碼示例。
示例1: FLA_QR_UT_piv_blk_var2
FLA_Error FLA_QR_UT_piv_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p, fla_qrut_t* cntl )
{
FLA_Obj ATL, ATR, A00, A01, A02,
ABL, ABR, A10, A11, A12,
A20, A21, A22;
FLA_Obj TL, TR, T0, T1, W12;
FLA_Obj TT, TB;
FLA_Obj pT, p0,
pB, p1,
p2;
FLA_Obj wT, w0,
wB, w1,
w2;
dim_t b_alg, b;
// Query the algorithmic blocksize by inspecting the length of T.
b_alg = FLA_Obj_length( T );
FLA_Part_2x2( A, &ATL, &ATR,
&ABL, &ABR, 0, 0, FLA_TL );
FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );
FLA_Part_2x1( p, &pT,
&pB, 0, FLA_TOP );
FLA_Part_2x1( w, &wT,
&wB, 0, FLA_TOP );
while ( FLA_Obj_min_dim( ABR ) > 0 ){
b = min( b_alg, FLA_Obj_min_dim( ABR ) );
FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02,
/* ************* */ /* ******************** */
&A10, /**/ &A11, &A12,
ABL, /**/ ABR, &A20, /**/ &A21, &A22,
b, b, FLA_BR );
FLA_Repart_1x2_to_1x3( TL, /**/ TR, &T0, /**/ &T1, &W12,
b, FLA_RIGHT );
FLA_Repart_2x1_to_3x1( pT, &p0,
/* ** */ /* ** */
&p1,
pB, &p2, b, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( wT, &w0,
/* ** */ /* ** */
&w1,
wB, &w2, b, FLA_BOTTOM );
/*------------------------------------------------------------*/
// ** Reshape T matrices to match the blocksize b
FLA_Part_2x1( TR, &TT,
&TB, b, FLA_TOP );
// ** Perform a unblocked (BLAS2-oriented) QR factorization
// with pivoting via the UT transform on ABR:
//
// ABR -> QB1 R11
//
// where:
// - QB1 is formed from UB1 (which is stored column-wise below the
// diagonal of ( A11 A21 )^T and the upper-triangle of T1.
// - R11 is stored to ( A11 A12 ).
// - W12 stores T and partial updates for FLA_Apply_Q_UT_piv_var.
FLA_QR_UT_piv_internal( ABR, TT, wB, p1,
FLA_Cntl_sub_qrut( cntl ) );
if ( FLA_Obj_width( A12 ) > 0 )
{
// ** Block update
FLA_Part_2x1( W12, &TT,
&TB, b, FLA_TOP );
FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
FLA_MINUS_ONE, A21, TT, FLA_ONE, A22 );
}
// ** Apply pivots to previous columns.
FLA_Apply_pivots( FLA_RIGHT, FLA_TRANSPOSE, p1, ATR );
/*------------------------------------------------------------*/
FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02,
A10, A11, /**/ A12,
/* ************** */ /* ****************** */
&ABL, /**/ &ABR, A20, A21, /**/ A22,
FLA_TL );
FLA_Cont_with_1x3_to_1x2( &TL, /**/ &TR, T0, T1, /**/ W12,
FLA_LEFT );
FLA_Cont_with_3x1_to_2x1( &pT, p0,
//.........這裏部分代碼省略.........
示例2: FLASH_SA_LU
FLA_Error FLASH_SA_LU( FLA_Obj B, FLA_Obj C,
FLA_Obj D, FLA_Obj E, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t* cntl )
{
FLA_Obj DT, D0,
DB, D1,
D2;
FLA_Obj ET, E0,
EB, E1,
E2;
FLA_Obj pT, p0,
pB, p1,
p2;
FLA_Obj LT, L0,
LB, L1,
L2;
FLA_Part_2x1( D, &DT,
&DB, 0, FLA_TOP );
FLA_Part_2x1( E, &ET,
&EB, 0, FLA_TOP );
FLA_Part_2x1( p, &pT,
&pB, 0, FLA_TOP );
FLA_Part_2x1( L, &LT,
&LB, 0, FLA_TOP );
while ( FLA_Obj_length( DT ) < FLA_Obj_length( D ) )
{
FLA_Repart_2x1_to_3x1( DT, &D0,
/* ** */ /* ** */
&D1,
DB, &D2, 1, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( ET, &E0,
/* ** */ /* ** */
&E1,
EB, &E2, 1, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( pT, &p0,
/* ** */ /* ** */
&p1,
pB, &p2, 1, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( LT, &L0,
/* ** */ /* ** */
&L1,
LB, &L2, 1, FLA_BOTTOM );
/*------------------------------------------------------------*/
if ( FLASH_Queue_get_enabled( ) )
{
// Enqueue
ENQUEUE_FLASH_SA_LU( *FLASH_OBJ_PTR_AT( B ),
*FLASH_OBJ_PTR_AT( D1 ),
*FLASH_OBJ_PTR_AT( p1 ),
*FLASH_OBJ_PTR_AT( L1 ),
nb_alg,
FLA_Cntl_sub_lu( cntl ) );
}
else
{
// Execute leaf
FLA_SA_LU_task( *FLASH_OBJ_PTR_AT( B ),
*FLASH_OBJ_PTR_AT( D1 ),
*FLASH_OBJ_PTR_AT( p1 ),
*FLASH_OBJ_PTR_AT( L1 ),
nb_alg,
FLA_Cntl_sub_lu( cntl ) );
}
FLASH_SA_FS( L1,
D1, p1, C,
E1, nb_alg, FLA_Cntl_sub_gemm1( cntl ) );
/*------------------------------------------------------------*/
FLA_Cont_with_3x1_to_2x1( &DT, D0,
D1,
/* ** */ /* ** */
&DB, D2, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &ET, E0,
E1,
/* ** */ /* ** */
&EB, E2, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &pT, p0,
p1,
/* ** */ /* ** */
&pB, p2, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &LT, L0,
L1,
/* ** */ /* ** */
//.........這裏部分代碼省略.........
示例3: FLA_Her2k_ln_unb_var6
/*
 * Unblocked Her2k, variant 6, operating on the lower triangle of C.
 *
 * The lower triangle of C is scaled by beta once, up front.  The loop then
 * peels one row at a time off the bottom of the "top" partitions of A and B
 * (and the matching row/column off CTL) and accumulates the alpha-scaled
 * contributions into c10t, c21 and gamma11.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Her2k_ln_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
    /* Row views of A. */
    FLA_Obj AT, AB;
    FLA_Obj A0, a1t, A2;

    /* Row views of B. */
    FLA_Obj BT, BB;
    FLA_Obj B0, b1t, B2;

    /* Quadrant views of C. */
    FLA_Obj CTL, CTR, CBL, CBR;
    FLA_Obj C00, c01, C02;
    FLA_Obj c10t, gamma11, c12t;
    FLA_Obj C20, c21, C22;

    /* beta is applied here, so every update below accumulates with FLA_ONE. */
    FLA_Scalr_external( FLA_LOWER_TRIANGULAR, beta, C );

    /* Start with empty bottom partitions (empty bottom-right quadrant for C). */
    FLA_Part_2x1( A,    &AT,
                        &AB,            0, FLA_BOTTOM );
    FLA_Part_2x1( B,    &BT,
                        &BB,            0, FLA_BOTTOM );
    FLA_Part_2x2( C,    &CTL, &CTR,
                        &CBL, &CBR,     0, 0, FLA_BR );

    /* Iterate until the bottom partition of A has absorbed every row. */
    while ( FLA_Obj_length( AB ) < FLA_Obj_length( A ) )
    {
        /* Expose the current row of A and B, and the current diagonal element of C. */
        FLA_Repart_2x1_to_3x1( AT,    &A0,
                                      &a1t,
                               AB,    &A2,        1, FLA_TOP );
        FLA_Repart_2x1_to_3x1( BT,    &B0,
                                      &b1t,
                               BB,    &B2,        1, FLA_TOP );
        FLA_Repart_2x2_to_3x3( CTL, CTR,    &C00,  &c01,     &C02,
                                            &c10t, &gamma11, &c12t,
                               CBL, CBR,    &C20,  &c21,     &C22,
                               1, 1, FLA_TL );

        /* c10t = c10t + a1t * B0'   (scaled by alpha) */
        FLA_Gemv_external( FLA_CONJ_NO_TRANSPOSE, alpha, B0, a1t, FLA_ONE, c10t );

        /* c21 = c21 + B2 * a1t'     (scaled by alpha) */
        FLA_Gemvc_external( FLA_NO_TRANSPOSE, FLA_CONJUGATE, alpha, B2, a1t, FLA_ONE, c21 );

        /* gamma11 = gamma11 + a1t * b1t' + b1t * a1t'   (scaled by alpha) */
        FLA_Dot2cs_external( FLA_CONJUGATE, alpha, a1t, b1t, FLA_ONE, gamma11 );

        /* Fold the processed row into the bottom partitions. */
        FLA_Cont_with_3x1_to_2x1( &AT,    A0,
                                          a1t,
                                  &AB,    A2,     FLA_BOTTOM );
        FLA_Cont_with_3x1_to_2x1( &BT,    B0,
                                          b1t,
                                  &BB,    B2,     FLA_BOTTOM );
        FLA_Cont_with_3x3_to_2x2( &CTL, &CTR,    C00,  c01,     C02,
                                                 c10t, gamma11, c12t,
                                  &CBL, &CBR,    C20,  c21,     C22,
                                  FLA_BR );
    }

    return FLA_SUCCESS;
}
示例4: FLA_UDdate_UT_blk_var2
/*
 * Blocked up/downdate driver, variant 2.
 *
 * C, D and T are traversed top to bottom in row panels.  For every panel
 * triple (C1, D1, T1) the work is delegated to FLA_UDdate_UT_internal()
 * together with R and the sub-control-tree taken from cntl.
 *
 * The loop stops as soon as either C or D has been fully consumed.
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_UDdate_UT_blk_var2( FLA_Obj R,
                                  FLA_Obj C,
                                  FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl )
{
    FLA_Obj CT, CB;
    FLA_Obj C0, C1, C2;

    FLA_Obj DT, DB;
    FLA_Obj D0, D1, D2;

    FLA_Obj TT, TB;
    FLA_Obj T0, T1, T2;

    /* All three operands start with an empty top partition. */
    FLA_Part_2x1( C,    &CT,
                        &CB,            0, FLA_TOP );
    FLA_Part_2x1( D,    &DT,
                        &DB,            0, FLA_TOP );
    FLA_Part_2x1( T,    &TT,
                        &TB,            0, FLA_TOP );

    while ( FLA_Obj_length( CT ) < FLA_Obj_length( C ) &&
            FLA_Obj_length( DT ) < FLA_Obj_length( D ) )
    {
        /* Each operand gets its own panel height, computed from what is left of it. */
        dim_t b_C = FLA_Determine_blocksize( CB, FLA_BOTTOM, FLA_Cntl_blocksize( cntl ) );
        dim_t b_D = FLA_Determine_blocksize( DB, FLA_BOTTOM, FLA_Cntl_blocksize( cntl ) );
        dim_t b_T = FLA_Determine_blocksize( TB, FLA_BOTTOM, FLA_Cntl_blocksize( cntl ) );

        FLA_Repart_2x1_to_3x1( CT,    &C0,
                                      &C1,
                               CB,    &C2,        b_C, FLA_BOTTOM );
        FLA_Repart_2x1_to_3x1( DT,    &D0,
                                      &D1,
                               DB,    &D2,        b_D, FLA_BOTTOM );
        FLA_Repart_2x1_to_3x1( TT,    &T0,
                                      &T1,
                               TB,    &T2,        b_T, FLA_BOTTOM );

        /*
         * Up/downdate the upper triangular Cholesky factor R with "UD" UT
         * Householder transformations:
         *
         *   [ R, C1, D1, T1 ] = FLA_UDdate_UT( R, C1, D1, T1 )
         *
         * The rows of D1 are removed from the factorization while the rows
         * of C1 are added to it; C1 and D1 are overwritten in the process.
         * Either C1 or D1 may be empty in a given iteration.
         */
        FLA_UDdate_UT_internal( R,
                                C1,
                                D1, T1,
                                FLA_Cntl_sub_uddateut( cntl ) );

        /* Move the processed panels into the top partitions. */
        FLA_Cont_with_3x1_to_2x1( &CT,    C0,
                                          C1,
                                  &CB,    C2,     FLA_TOP );
        FLA_Cont_with_3x1_to_2x1( &DT,    D0,
                                          D1,
                                  &DB,    D2,     FLA_TOP );
        FLA_Cont_with_3x1_to_2x1( &TT,    T0,
                                          T1,
                                  &TB,    T2,     FLA_TOP );
    }

    return FLA_SUCCESS;
}
示例5: Symm_unb_var6
/*
 * Unblocked symmetric matrix-matrix multiply, variant 6.
 *
 * Walks the diagonal of A from top-left to bottom-right, one element at a
 * time, and updates the matching row of C:
 *
 *   c1t = c1t + a10t * B0 + alpha11 * b1t + a21' * B2
 *
 * Only a10t, alpha11 and a21 are read from A.
 * Always returns FLA_SUCCESS.
 */
int Symm_unb_var6( FLA_Obj A, FLA_Obj B, FLA_Obj C )
{
    /* Quadrant views of A. */
    FLA_Obj ATL, ATR, ABL, ABR;
    FLA_Obj A00, a01, A02;
    FLA_Obj a10t, alpha11, a12t;
    FLA_Obj A20, a21, A22;

    /* Row views of B and C. */
    FLA_Obj BT, BB;
    FLA_Obj B0, b1t, B2;
    FLA_Obj CT, CB;
    FLA_Obj C0, c1t, C2;

    FLA_Part_2x2( A,    &ATL, &ATR,
                        &ABL, &ABR,     0, 0, FLA_TL );
    FLA_Part_2x1( B,    &BT,
                        &BB,            0, FLA_TOP );
    FLA_Part_2x1( C,    &CT,
                        &CB,            0, FLA_TOP );

    while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )
    {
        /* Expose the next diagonal element of A and the next row of B and C. */
        FLA_Repart_2x2_to_3x3( ATL, ATR,    &A00,  &a01,     &A02,
                                            &a10t, &alpha11, &a12t,
                               ABL, ABR,    &A20,  &a21,     &A22,
                               1, 1, FLA_BR );
        FLA_Repart_2x1_to_3x1( BT,    &B0,
                                      &b1t,
                               BB,    &B2,        1, FLA_BOTTOM );
        FLA_Repart_2x1_to_3x1( CT,    &C0,
                                      &c1t,
                               CB,    &C2,        1, FLA_BOTTOM );

        /* c1t += B0' * a10t  (the a10t * B0 term) */
        FLA_Gemv( FLA_TRANSPOSE, FLA_ONE, B0, a10t, FLA_ONE, c1t );
        /* c1t += B2' * a21   (the a21' * B2 term) */
        FLA_Gemv( FLA_TRANSPOSE, FLA_ONE, B2, a21, FLA_ONE, c1t );
        /* c1t += alpha11 * b1t */
        FLA_Axpy( alpha11, b1t, c1t );

        /* Advance all three partitionings by one row/column. */
        FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,    A00,  a01,     A02,
                                                 a10t, alpha11, a12t,
                                  &ABL, &ABR,    A20,  a21,     A22,
                                  FLA_TL );
        FLA_Cont_with_3x1_to_2x1( &BT,    B0,
                                          b1t,
                                  &BB,    B2,     FLA_TOP );
        FLA_Cont_with_3x1_to_2x1( &CT,    C0,
                                          c1t,
                                  &CB,    C2,     FLA_TOP );
    }

    return FLA_SUCCESS;
}
示例6: FLA_Syrk_ln_omp2l_var3
FLA_Error FLA_Syrk_ln_omp2l_var3( FLA_Obj A, FLA_Obj C )
{
FLA_Obj AT, A0,
AB, A1,
A2;
FLA_Obj CTL, CTR, C00, C01, C02,
CBL, CBR, C10, C11, C12,
C20, C21, C22;
int b;
FLA_Part_2x1( A, &AT,
&AB, 0, FLA_BOTTOM );
FLA_Part_2x2( C, &CTL, &CTR,
&CBL, &CBR, 0, 0, FLA_BR );
#pragma intel omp parallel taskq
{
while ( FLA_Obj_length( AB ) < FLA_Obj_length( A ) ){
b = FLA_Task_compute_blocksize( 0, A, AB, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( AT, &A0,
&A1,
/* ** */ /* ** */
AB, &A2, b, FLA_TOP );
FLA_Repart_2x2_to_3x3( CTL, /**/ CTR, &C00, &C01, /**/ &C02,
&C10, &C11, /**/ &C12,
/* ************* */ /* ******************** */
CBL, /**/ CBR, &C20, &C21, /**/ &C22,
b, b, FLA_TL );
/*------------------------------------------------------------*/
#pragma intel omp task captureprivate(A1, A2, C21)
{
/* C21 = C21 + A2 * A1' */
FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A2, A1, FLA_ONE, C21 );
}
/*------------------------------------------------------------*/
FLA_Cont_with_3x1_to_2x1( &AT, A0,
/* ** */ /* ** */
A1,
&AB, A2, FLA_BOTTOM );
FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR, C00, /**/ C01, C02,
/* ************** */ /* ****************** */
C10, /**/ C11, C12,
&CBL, /**/ &CBR, C20, /**/ C21, C22,
FLA_BR );
}
FLA_Part_2x1( A, &AT,
&AB, 0, FLA_BOTTOM );
FLA_Part_2x2( C, &CTL, &CTR,
&CBL, &CBR, 0, 0, FLA_BR );
while ( FLA_Obj_length( AB ) < FLA_Obj_length( A ) ){
b = FLA_Task_compute_blocksize( 0, A, AB, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( AT, &A0,
&A1,
/* ** */ /* ** */
AB, &A2, b, FLA_TOP );
FLA_Repart_2x2_to_3x3( CTL, /**/ CTR, &C00, &C01, /**/ &C02,
&C10, &C11, /**/ &C12,
/* ************* */ /* ******************** */
CBL, /**/ CBR, &C20, &C21, /**/ &C22,
b, b, FLA_TL );
/*------------------------------------------------------------*/
#pragma intel omp task captureprivate(C11, A1)
{
/* C11 = C11 + A1 * A1' */
FLA_Syrk_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A1, FLA_ONE, C11 );
}
/*------------------------------------------------------------*/
FLA_Cont_with_3x1_to_2x1( &AT, A0,
/* ** */ /* ** */
A1,
&AB, A2, FLA_BOTTOM );
FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR, C00, /**/ C01, C02,
/* ************** */ /* ****************** */
C10, /**/ C11, C12,
&CBL, /**/ &CBR, C20, /**/ C21, C22,
FLA_BR );
//.........這裏部分代碼省略.........
示例7: FLA_Trsv_un_blk_var1
/*
 * Blocked triangular solve with a vector, variant 1, for an upper
 * triangular A.
 *
 * A is traversed from the bottom-right corner toward the top-left in
 * diagonal blocks, and x from the bottom up.  For each block:
 *
 *   x1 = x1 - A12 * x2
 *   x1 = triu( A11 ) \ x1
 *
 * diagA is forwarded unchanged to the inner triangular solve; the
 * sub-operations use the sub-control-trees taken from cntl.
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Trsv_un_blk_var1( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl )
{
    /* Quadrant views of A. */
    FLA_Obj ATL, ATR, ABL, ABR;
    FLA_Obj A00, A01, A02;
    FLA_Obj A10, A11, A12;
    FLA_Obj A20, A21, A22;

    /* Row views of x. */
    FLA_Obj xT, xB;
    FLA_Obj x0, x1, x2;

    /* The bottom-right quadrant of A and the bottom of x start out empty. */
    FLA_Part_2x2( A,    &ATL, &ATR,
                        &ABL, &ABR,     0, 0, FLA_BR );
    FLA_Part_2x1( x,    &xT,
                        &xB,            0, FLA_BOTTOM );

    while ( FLA_Obj_length( ABR ) < FLA_Obj_length( A ) )
    {
        /* Block size is derived from the part of A not yet processed. */
        dim_t b = FLA_Determine_blocksize( ATL, FLA_TL, FLA_Cntl_blocksize( cntl ) );

        FLA_Repart_2x2_to_3x3( ATL, ATR,    &A00, &A01, &A02,
                                            &A10, &A11, &A12,
                               ABL, ABR,    &A20, &A21, &A22,
                               b, b, FLA_TL );
        FLA_Repart_2x1_to_3x1( xT,    &x0,
                                      &x1,
                               xB,    &x2,        b, FLA_TOP );

        /* x1 = x1 - A12 * x2 */
        FLA_Gemv_internal( FLA_NO_TRANSPOSE,
                           FLA_MINUS_ONE, A12, x2, FLA_ONE, x1,
                           FLA_Cntl_sub_gemv( cntl ) );

        /* x1 = triu( A11 ) \ x1 */
        FLA_Trsv_internal( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE, diagA,
                           A11, x1,
                           FLA_Cntl_sub_trsv( cntl ) );

        /* Grow the bottom-right quadrant / bottom partition by the block just solved. */
        FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,    A00, A01, A02,
                                                 A10, A11, A12,
                                  &ABL, &ABR,    A20, A21, A22,
                                  FLA_BR );
        FLA_Cont_with_3x1_to_2x1( &xT,    x0,
                                          x1,
                                  &xB,    x2,     FLA_BOTTOM );
    }

    return FLA_SUCCESS;
}
示例8: FLA_Apply_QUD_UT_lhfc_blk_var1
FLA_Error FLA_Apply_QUD_UT_lhfc_blk_var1( FLA_Obj T, FLA_Obj W,
FLA_Obj R,
FLA_Obj U, FLA_Obj C,
FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl )
{
FLA_Obj TL, TR, T0, T1, T2;
FLA_Obj UL, UR, U0, U1, U2;
FLA_Obj VL, VR, V0, V1, V2;
FLA_Obj RT, R0,
RB, R1,
R2;
FLA_Obj T1T,
T1B;
FLA_Obj W1TL, W1TR,
W1BL, W1BR;
dim_t b_alg, b;
// Query the algorithmic blocksize by inspecting the length of T.
b_alg = FLA_Obj_length( T );
FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );
FLA_Part_1x2( U, &UL, &UR, 0, FLA_LEFT );
FLA_Part_1x2( V, &VL, &VR, 0, FLA_LEFT );
FLA_Part_2x1( R, &RT,
&RB, 0, FLA_TOP );
while ( FLA_Obj_width( UL ) < FLA_Obj_width( U ) ){
b = min( b_alg, FLA_Obj_width( UR ) );
FLA_Repart_1x2_to_1x3( TL, /**/ TR, &T0, /**/ &T1, &T2,
b, FLA_RIGHT );
FLA_Repart_1x2_to_1x3( UL, /**/ UR, &U0, /**/ &U1, &U2,
b, FLA_RIGHT );
FLA_Repart_1x2_to_1x3( VL, /**/ VR, &V0, /**/ &V1, &V2,
b, FLA_RIGHT );
FLA_Repart_2x1_to_3x1( RT, &R0,
/* ** */ /* ** */
&R1,
RB, &R2, b, FLA_BOTTOM );
/*------------------------------------------------------------*/
FLA_Part_2x1( T1, &T1T,
&T1B, b, FLA_TOP );
FLA_Part_2x2( W, &W1TL, &W1TR,
&W1BL, &W1BR, b, FLA_Obj_width( R1 ), FLA_TL );
// W1TL = R1;
FLA_Copyt_internal( FLA_NO_TRANSPOSE, R1, W1TL,
FLA_Cntl_sub_copyt( cntl ) );
// W1TL = inv( triu( T1T ) )' * ( R1 + U1' * C + V1' * D );
FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE,
FLA_ONE, U1, C, FLA_ONE, W1TL,
FLA_Cntl_sub_gemm1( cntl ) );
FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE,
FLA_ONE, V1, D, FLA_ONE, W1TL,
FLA_Cntl_sub_gemm2( cntl ) );
FLA_Trsm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
FLA_ONE, T1T, W1TL,
FLA_Cntl_sub_trsm( cntl ) );
// R1 = R1 - W1TL;
// C = C - U1 * W1TL;
// D = D + V1 * W1TL;
FLA_Axpyt_internal( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, W1TL, R1,
FLA_Cntl_sub_axpyt( cntl ) );
FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
FLA_MINUS_ONE, U1, W1TL, FLA_ONE, C,
FLA_Cntl_sub_gemm3( cntl ) );
FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
FLA_ONE, V1, W1TL, FLA_ONE, D,
FLA_Cntl_sub_gemm4( cntl ) );
/*------------------------------------------------------------*/
FLA_Cont_with_1x3_to_1x2( &TL, /**/ &TR, T0, T1, /**/ T2,
FLA_LEFT );
//.........這裏部分代碼省略.........
示例9: Symm_blk_var8
/*
 * Blocked symmetric matrix-matrix multiply, variant 8.
 *
 * Walks the diagonal of A from top-left to bottom-right in blocks of at
 * most nb_alg rows/columns.  For each diagonal block:
 *
 *   C0 = C0 + A10' * B1
 *   C1 = C1 + A10  * B0 + A11 * B1
 *
 * A11 is applied through FLA_Symm reading its lower triangle.
 * Always returns FLA_SUCCESS.
 */
int Symm_blk_var8( FLA_Obj A, FLA_Obj B, FLA_Obj C, int nb_alg )
{
    /* Quadrant views of A. */
    FLA_Obj ATL, ATR, ABL, ABR;
    FLA_Obj A00, A01, A02;
    FLA_Obj A10, A11, A12;
    FLA_Obj A20, A21, A22;

    /* Row-panel views of B and C. */
    FLA_Obj BT, BB;
    FLA_Obj B0, B1, B2;
    FLA_Obj CT, CB;
    FLA_Obj C0, C1, C2;

    FLA_Part_2x2( A,    &ATL, &ATR,
                        &ABL, &ABR,     0, 0, FLA_TL );
    FLA_Part_2x1( B,    &BT,
                        &BB,            0, FLA_TOP );
    FLA_Part_2x1( C,    &CT,
                        &CB,            0, FLA_TOP );

    while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )
    {
        /* Never take more rows than remain in the bottom-right quadrant. */
        int b = min( FLA_Obj_length( ABR ), nb_alg );

        FLA_Repart_2x2_to_3x3( ATL, ATR,    &A00, &A01, &A02,
                                            &A10, &A11, &A12,
                               ABL, ABR,    &A20, &A21, &A22,
                               b, b, FLA_BR );
        FLA_Repart_2x1_to_3x1( BT,    &B0,
                                      &B1,
                               BB,    &B2,        b, FLA_BOTTOM );
        FLA_Repart_2x1_to_3x1( CT,    &C0,
                                      &C1,
                               CB,    &C2,        b, FLA_BOTTOM );

        /* C0 = C0 + A10' * B1 */
        FLA_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A10, B1, FLA_ONE, C0 );

        /* C1 = C1 + A10 * B0 */
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A10, B0, FLA_ONE, C1 );

        /* C1 = C1 + A11 * B1, with A11 symmetric (lower triangle referenced) */
        FLA_Symm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_ONE, A11, B1, FLA_ONE, C1 );

        /* Advance all three partitionings by the block just processed. */
        FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,    A00, A01, A02,
                                                 A10, A11, A12,
                                  &ABL, &ABR,    A20, A21, A22,
                                  FLA_TL );
        FLA_Cont_with_3x1_to_2x1( &BT,    B0,
                                          B1,
                                  &BB,    B2,     FLA_TOP );
        FLA_Cont_with_3x1_to_2x1( &CT,    C0,
                                          C1,
                                  &CB,    C2,     FLA_TOP );
    }

    return FLA_SUCCESS;
}
示例10: FLA_Gemm_pp_nn_var1
FLA_Error FLA_Gemm_pp_nn_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj C, int nb_alg )
{
FLA_Obj AT, A0,
AB, A1,
A2;
FLA_Obj CT, C0,
CB, C1,
C2;
FLA_Obj packed_C1;
int b;
FLA_Part_2x1( A, &AT,
&AB, 0, FLA_TOP );
FLA_Part_2x1( C, &CT,
&CB, 0, FLA_TOP );
/* Initialize the FLA_Gemm() interface to the kernel environment
Note: the blocksize given to the kernel environment can be non-
square. We pass the m and n dimensions of the blocksize here. */
FLA_Gemm_init( nb_alg, FLA_Obj_width( A ) );
/* Pack B */
/* Note: the idea here is that, optionally,
- B is packed, and/or
- B is scaled
If B needs not be packed, it is not packed. If the multiplication
by alpha happens elsewhere, no scaling occurs.
The "NoTranspose" means that in the version of gemm being updated
B is not transposed. In packing, B could be transposed, if there
is an advantage to this. So, the "NoTranspose" means that input
B is not transposed in the FLA_Gemm call. */
FLA_Gemm_pack_andor_scale_B( FLA_NO_TRANSPOSE, alpha, B );
while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){
b = min( FLA_Obj_length( AB ), nb_alg );
FLA_Repart_2x1_to_3x1( AT, &A0,
/* ** */ /* ** */
&A1,
AB, &A2, b, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( CT, &C0,
/* ** */ /* ** */
&C1,
CB, &C2, b, FLA_BOTTOM );
/*------------------------------------------------------------*/
/* C1 = alpha * A1 * B + C1; */
/* Pack C1 */
/* Note: the idea here is that, optionally,
- packed space is provided for computing packed_C1 = alpha * A1 * B
If C1 needs not be packed, then C can just be returned by this routine. */
FLA_Gemm_pack_C( FLA_NO_TRANSPOSE, C1 );
/* Pack A */
/* Note: the idea here is that, optionally,
- A is packed, and/or
- A is scaled
If A needs not be packed, it is not packed. If the multiplication
by alpha happens elsewhere, no scaling occurs. */
FLA_Gemm_pack_andor_scale_A( FLA_NO_TRANSPOSE, alpha, A1 );
/* Call the kernel routine */
FLA_Gemm_kernel( alpha, A1, B, C1 );
/* Unpack C1 */
/* Note: the idea here is that, optionally,
- packed_C1 is added to C1, possibly scaled at this point. */
FLA_Gemm_unpack_andor_scale_C( FLA_NO_TRANSPOSE, alpha, C1 );
/*------------------------------------------------------------*/
FLA_Cont_with_3x1_to_2x1( &AT, A0,
A1,
/* ** */ /* ** */
&AB, A2, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &CT, C0,
C1,
/* ** */ /* ** */
&CB, C2, FLA_TOP );
}
/* Release the space used to pack A1 */
/* Note: notice that the space provided for A1 can be recycled
everytime through the loop, which is why this call is outside the loop.
If the space is statically allocated, or A1 was not packed,
this could be a no-op. */
FLA_Gemm_release_pack_A( FLA_NO_TRANSPOSE, A1 );
//.........這裏部分代碼省略.........
示例11: FLA_Syr2k_un_unb_var4
/*
 * Unblocked Syr2k, variant 4, operating on the upper triangle of C.
 *
 * The upper triangle of C is scaled by beta once, up front.  The loop then
 * walks A and B one row at a time from the top down (and C along its
 * diagonal from the top-left), accumulating the alpha-scaled contributions
 * into c01 and gamma11.
 *
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Syr2k_un_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
    /* Row views of A. */
    FLA_Obj AT, AB;
    FLA_Obj A0, a1t, A2;

    /* Row views of B. */
    FLA_Obj BT, BB;
    FLA_Obj B0, b1t, B2;

    /* Quadrant views of C. */
    FLA_Obj CTL, CTR, CBL, CBR;
    FLA_Obj C00, c01, C02;
    FLA_Obj c10t, gamma11, c12t;
    FLA_Obj C20, c21, C22;

    /* beta is applied here, so every update below accumulates with FLA_ONE. */
    FLA_Scalr_external( FLA_UPPER_TRIANGULAR, beta, C );

    FLA_Part_2x1( A,    &AT,
                        &AB,            0, FLA_TOP );
    FLA_Part_2x1( B,    &BT,
                        &BB,            0, FLA_TOP );
    FLA_Part_2x2( C,    &CTL, &CTR,
                        &CBL, &CBR,     0, 0, FLA_TL );

    while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) )
    {
        /* Expose the current row of A and B, and the current diagonal element of C. */
        FLA_Repart_2x1_to_3x1( AT,    &A0,
                                      &a1t,
                               AB,    &A2,        1, FLA_BOTTOM );
        FLA_Repart_2x1_to_3x1( BT,    &B0,
                                      &b1t,
                               BB,    &B2,        1, FLA_BOTTOM );
        FLA_Repart_2x2_to_3x3( CTL, CTR,    &C00,  &c01,     &C02,
                                            &c10t, &gamma11, &c12t,
                               CBL, CBR,    &C20,  &c21,     &C22,
                               1, 1, FLA_BR );

        /* c01 = c01 + A0 * b1t'   (scaled by alpha) */
        FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, A0, b1t, FLA_ONE, c01 );

        /* c01 = c01 + B0 * a1t'   (scaled by alpha) */
        FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, B0, a1t, FLA_ONE, c01 );

        /* gamma11 = gamma11 + a1t * b1t' + b1t * a1t'   (scaled by alpha) */
        FLA_Dot2s_external( alpha, a1t, b1t, FLA_ONE, gamma11 );

        /* Fold the processed row into the top partitions. */
        FLA_Cont_with_3x1_to_2x1( &AT,    A0,
                                          a1t,
                                  &AB,    A2,     FLA_TOP );
        FLA_Cont_with_3x1_to_2x1( &BT,    B0,
                                          b1t,
                                  &BB,    B2,     FLA_TOP );
        FLA_Cont_with_3x3_to_2x2( &CTL, &CTR,    C00,  c01,     C02,
                                                 c10t, gamma11, c12t,
                                  &CBL, &CBR,    C20,  c21,     C22,
                                  FLA_TL );
    }

    return FLA_SUCCESS;
}
示例12: FLASH_Axpy_hierarchy
FLA_Error FLASH_Axpy_hierarchy( int direction, FLA_Obj alpha, FLA_Obj F, FLA_Obj* H )
{
// Once we get down to a submatrix whose elements are scalars, we are down
// to our base case.
if ( FLA_Obj_elemtype( *H ) == FLA_SCALAR )
{
// Depending on which top-level function invoked us, we either axpy
// the source data in the flat matrix to the leaf-level submatrix of
// the hierarchical matrix, or axpy the data in the hierarchical
// submatrix to the flat matrix.
if ( direction == FLA_FLAT_TO_HIER )
{
#ifdef FLA_ENABLE_SCC
if ( FLA_is_owner() )
#endif
FLA_Axpy_external( alpha, F, *H );
}
else if ( direction == FLA_HIER_TO_FLAT )
{
#ifdef FLA_ENABLE_SCC
if ( FLA_is_owner() )
#endif
FLA_Axpy_external( alpha, *H, F );
}
}
else
{
FLA_Obj HL, HR, H0, H1, H2;
FLA_Obj FL, FR, F0, F1, F2;
FLA_Obj H1T, H01,
H1B, H11,
H21;
FLA_Obj F1T, F01,
F1B, F11,
F21;
dim_t b_m;
dim_t b_n;
FLA_Part_1x2( *H, &HL, &HR, 0, FLA_LEFT );
FLA_Part_1x2( F, &FL, &FR, 0, FLA_LEFT );
while ( FLA_Obj_width( HL ) < FLA_Obj_width( *H ) )
{
FLA_Repart_1x2_to_1x3( HL, /**/ HR, &H0, /**/ &H1, &H2,
1, FLA_RIGHT );
// Get the scalar width of H1 and use that to determine the
// width of F1.
b_n = FLASH_Obj_scalar_width( H1 );
FLA_Repart_1x2_to_1x3( FL, /**/ FR, &F0, /**/ &F1, &F2,
b_n, FLA_RIGHT );
// -------------------------------------------------------------
FLA_Part_2x1( H1, &H1T,
&H1B, 0, FLA_TOP );
FLA_Part_2x1( F1, &F1T,
&F1B, 0, FLA_TOP );
while ( FLA_Obj_length( H1T ) < FLA_Obj_length( H1 ) )
{
FLA_Repart_2x1_to_3x1( H1T, &H01,
/* ** */ /* *** */
&H11,
H1B, &H21, 1, FLA_BOTTOM );
// Get the scalar length of H11 and use that to determine the
// length of F11.
b_m = FLASH_Obj_scalar_length( H11 );
FLA_Repart_2x1_to_3x1( F1T, &F01,
/* ** */ /* *** */
&F11,
F1B, &F21, b_m, FLA_BOTTOM );
// -------------------------------------------------------------
// Recursively axpy between F11 and H11.
FLASH_Axpy_hierarchy( direction, alpha, F11,
FLASH_OBJ_PTR_AT( H11 ) );
// -------------------------------------------------------------
FLA_Cont_with_3x1_to_2x1( &H1T, H01,
H11,
/* ** */ /* *** */
&H1B, H21, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &F1T, F01,
F11,
/* ** */ /* *** */
&F1B, F21, FLA_TOP );
}
// -------------------------------------------------------------
FLA_Cont_with_1x3_to_1x2( &HL, /**/ &HR, H0, H1, /**/ H2,
FLA_LEFT );
FLA_Cont_with_1x3_to_1x2( &FL, /**/ &FR, F0, F1, /**/ F2,
//.........這裏部分代碼省略.........
示例13: FLA_Sylv_nh_blk_var16
/*
 * Blocked Sylvester solve, variant 16 (A not transposed, B
 * conjugate-transposed in the inner solve).
 *
 * A is traversed from the bottom-right corner toward the top-left in
 * diagonal blocks and C from the bottom up.  For each block:
 *
 *   C1 = sylv( A11, B', C1 )
 *   C0 = C0 - A01 * C1
 *
 * isgn and scale are forwarded unchanged to the inner solve; the
 * sub-operations use the sub-control-trees taken from cntl.
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Sylv_nh_blk_var16( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl )
{
    /* Quadrant views of A. */
    FLA_Obj ATL, ATR, ABL, ABR;
    FLA_Obj A00, A01, A02;
    FLA_Obj A10, A11, A12;
    FLA_Obj A20, A21, A22;

    /* Row-panel views of C. */
    FLA_Obj CT, CB;
    FLA_Obj C0, C1, C2;

    /* The bottom-right quadrant of A and the bottom of C start out empty. */
    FLA_Part_2x2( A,    &ATL, &ATR,
                        &ABL, &ABR,     0, 0, FLA_BR );
    FLA_Part_2x1( C,    &CT,
                        &CB,            0, FLA_BOTTOM );

    while ( FLA_Obj_length( ABR ) < FLA_Obj_length( A ) )
    {
        /* Block size is derived from the part of C not yet processed. */
        dim_t b = FLA_Determine_blocksize( CT, FLA_TOP, FLA_Cntl_blocksize( cntl ) );

        FLA_Repart_2x2_to_3x3( ATL, ATR,    &A00, &A01, &A02,
                                            &A10, &A11, &A12,
                               ABL, ABR,    &A20, &A21, &A22,
                               b, b, FLA_TL );
        FLA_Repart_2x1_to_3x1( CT,    &C0,
                                      &C1,
                               CB,    &C2,        b, FLA_TOP );

        /*
         * Loop invariant:
         *   CT = CT - ATR * sylv( ABR, B', CB )
         *   CB = sylv( ABR, B', CB )
         */

        /* C1 = sylv( A11, B', C1 ) */
        FLA_Sylv_internal( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
                           isgn, A11, B, C1, scale,
                           FLA_Cntl_sub_sylv1( cntl ) );

        /* C0 = C0 - A01 * C1 */
        FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                           FLA_MINUS_ONE, A01, C1, FLA_ONE, C0,
                           FLA_Cntl_sub_gemm1( cntl ) );

        /* Grow the bottom-right quadrant / bottom partition by the block just solved. */
        FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,    A00, A01, A02,
                                                 A10, A11, A12,
                                  &ABL, &ABR,    A20, A21, A22,
                                  FLA_BR );
        FLA_Cont_with_3x1_to_2x1( &CT,    C0,
                                          C1,
                                  &CB,    C2,     FLA_BOTTOM );
    }

    return FLA_SUCCESS;
}
示例14: Symm_blk_var1
/*
 * Blocked symmetric matrix-matrix multiply, variant 1.
 *
 * Walks the diagonal of A from top-left to bottom-right in blocks of at
 * most nb_alg rows/columns.  For each diagonal block:
 *
 *   C1 = C1 + A10  * B0 + A11 * B1
 *   C0 = C0 + A10' * B1
 *
 * The original body spelled these updates as arithmetic on FLA_Obj values,
 * which is not valid C.  They are now issued through FLA_Gemm / FLA_Symm,
 * exactly as Symm_blk_var8 does for the same update.  The A10 term in the
 * C0 update is transposed because A10 (b x k) can only be conformal with
 * B1 (b x n) and C0 (k x n) in that form.
 *
 * NOTE(review): A is assumed symmetric with its lower triangle stored
 * (only A10 and the lower triangle of A11 are read) -- confirm against
 * the caller.
 *
 * Always returns FLA_SUCCESS.
 */
int Symm_blk_var1( FLA_Obj A, FLA_Obj B, FLA_Obj C, int nb_alg )
{
    /* Quadrant views of A. */
    FLA_Obj ATL, ATR, ABL, ABR;
    FLA_Obj A00, A01, A02;
    FLA_Obj A10, A11, A12;
    FLA_Obj A20, A21, A22;

    /* Row-panel views of B and C. */
    FLA_Obj BT, BB;
    FLA_Obj B0, B1, B2;
    FLA_Obj CT, CB;
    FLA_Obj C0, C1, C2;

    int b;

    FLA_Part_2x2( A,    &ATL, &ATR,
                        &ABL, &ABR,     0, 0, FLA_TL );
    FLA_Part_2x1( B,    &BT,
                        &BB,            0, FLA_TOP );
    FLA_Part_2x1( C,    &CT,
                        &CB,            0, FLA_TOP );

    while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )
    {
        /* Never take more rows than remain in the bottom-right quadrant. */
        b = min( FLA_Obj_length( ABR ), nb_alg );

        FLA_Repart_2x2_to_3x3( ATL, ATR,    &A00, &A01, &A02,
                                            &A10, &A11, &A12,
                               ABL, ABR,    &A20, &A21, &A22,
                               b, b, FLA_BR );
        FLA_Repart_2x1_to_3x1( BT,    &B0,
                                      &B1,
                               BB,    &B2,        b, FLA_BOTTOM );
        FLA_Repart_2x1_to_3x1( CT,    &C0,
                                      &C1,
                               CB,    &C2,        b, FLA_BOTTOM );

        /* C1 = C1 + A10 * B0 */
        FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A10, B0, FLA_ONE, C1 );

        /* C1 = C1 + A11 * B1, with A11 symmetric (lower triangle referenced) */
        FLA_Symm( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_ONE, A11, B1, FLA_ONE, C1 );

        /* C0 = C0 + A10' * B1 */
        FLA_Gemm( FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A10, B1, FLA_ONE, C0 );

        /* Advance all three partitionings by the block just processed. */
        FLA_Cont_with_3x3_to_2x2( &ATL, &ATR,    A00, A01, A02,
                                                 A10, A11, A12,
                                  &ABL, &ABR,    A20, A21, A22,
                                  FLA_TL );
        FLA_Cont_with_3x1_to_2x1( &BT,    B0,
                                          B1,
                                  &BB,    B2,     FLA_TOP );
        FLA_Cont_with_3x1_to_2x1( &CT,    C0,
                                          C1,
                                  &CB,    C2,     FLA_TOP );
    }

    return FLA_SUCCESS;
}
示例15: FLA_Syrk_un_blk_var3
/*
 * Blocked Syrk, variant 3, operating on the upper triangle of C.
 *
 * A is traversed from the bottom up in row panels and C from the
 * bottom-right corner toward the top-left in diagonal blocks.  For each
 * block, alpha and beta are handed straight to the sub-operations:
 *
 *   C12 = beta * C12 + alpha * A1 * A2'
 *   C11 = beta * C11 + alpha * A1 * A1'   (upper triangle)
 *
 * The sub-operations use the sub-control-trees taken from cntl.
 * Always returns FLA_SUCCESS.
 */
FLA_Error FLA_Syrk_un_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl )
{
    /* Row-panel views of A. */
    FLA_Obj AT, AB;
    FLA_Obj A0, A1, A2;

    /* Quadrant views of C. */
    FLA_Obj CTL, CTR, CBL, CBR;
    FLA_Obj C00, C01, C02;
    FLA_Obj C10, C11, C12;
    FLA_Obj C20, C21, C22;

    /* The bottom of A and the bottom-right quadrant of C start out empty. */
    FLA_Part_2x1( A,    &AT,
                        &AB,            0, FLA_BOTTOM );
    FLA_Part_2x2( C,    &CTL, &CTR,
                        &CBL, &CBR,     0, 0, FLA_BR );

    while ( FLA_Obj_length( AB ) < FLA_Obj_length( A ) )
    {
        /* Block size is derived from the part of A not yet processed. */
        dim_t b = FLA_Determine_blocksize( AT, FLA_TOP, FLA_Cntl_blocksize( cntl ) );

        FLA_Repart_2x1_to_3x1( AT,    &A0,
                                      &A1,
                               AB,    &A2,        b, FLA_TOP );
        FLA_Repart_2x2_to_3x3( CTL, CTR,    &C00, &C01, &C02,
                                            &C10, &C11, &C12,
                               CBL, CBR,    &C20, &C21, &C22,
                               b, b, FLA_TL );

        /* C12 = beta * C12 + alpha * A1 * A2' */
        FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_TRANSPOSE,
                           alpha, A1, A2, beta, C12,
                           FLA_Cntl_sub_gemm( cntl ) );

        /* C11 = beta * C11 + alpha * A1 * A1' */
        FLA_Syrk_internal( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
                           alpha, A1, beta, C11,
                           FLA_Cntl_sub_syrk( cntl ) );

        /* Grow the bottom partition / bottom-right quadrant by the block just processed. */
        FLA_Cont_with_3x1_to_2x1( &AT,    A0,
                                          A1,
                                  &AB,    A2,     FLA_BOTTOM );
        FLA_Cont_with_3x3_to_2x2( &CTL, &CTR,    C00, C01, C02,
                                                 C10, C11, C12,
                                  &CBL, &CBR,    C20, C21, C22,
                                  FLA_BR );
    }

    return FLA_SUCCESS;
}