本文整理汇总了C++中FLA_Cont_with_3x1_to_2x1函数的典型用法代码示例。如果您正苦于以下问题:C++ FLA_Cont_with_3x1_to_2x1函数的具体用法?C++ FLA_Cont_with_3x1_to_2x1怎么用?C++ FLA_Cont_with_3x1_to_2x1使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了FLA_Cont_with_3x1_to_2x1函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: FLA_QR_UT_piv_blk_var2
FLA_Error FLA_QR_UT_piv_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p, fla_qrut_t* cntl )
{
FLA_Obj ATL, ATR, A00, A01, A02,
ABL, ABR, A10, A11, A12,
A20, A21, A22;
FLA_Obj TL, TR, T0, T1, W12;
FLA_Obj TT, TB;
FLA_Obj pT, p0,
pB, p1,
p2;
FLA_Obj wT, w0,
wB, w1,
w2;
dim_t b_alg, b;
// Query the algorithmic blocksize by inspecting the length of T.
b_alg = FLA_Obj_length( T );
FLA_Part_2x2( A, &ATL, &ATR,
&ABL, &ABR, 0, 0, FLA_TL );
FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );
FLA_Part_2x1( p, &pT,
&pB, 0, FLA_TOP );
FLA_Part_2x1( w, &wT,
&wB, 0, FLA_TOP );
while ( FLA_Obj_min_dim( ABR ) > 0 ){
b = min( b_alg, FLA_Obj_min_dim( ABR ) );
FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02,
/* ************* */ /* ******************** */
&A10, /**/ &A11, &A12,
ABL, /**/ ABR, &A20, /**/ &A21, &A22,
b, b, FLA_BR );
FLA_Repart_1x2_to_1x3( TL, /**/ TR, &T0, /**/ &T1, &W12,
b, FLA_RIGHT );
FLA_Repart_2x1_to_3x1( pT, &p0,
/* ** */ /* ** */
&p1,
pB, &p2, b, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( wT, &w0,
/* ** */ /* ** */
&w1,
wB, &w2, b, FLA_BOTTOM );
/*------------------------------------------------------------*/
// ** Reshape T matrices to match the blocksize b
FLA_Part_2x1( TR, &TT,
&TB, b, FLA_TOP );
// ** Perform a unblocked (BLAS2-oriented) QR factorization
// with pivoting via the UT transform on ABR:
//
// ABR -> QB1 R11
//
// where:
// - QB1 is formed from UB1 (which is stored column-wise below the
// diagonal of ( A11 A21 )^T and the upper-triangle of T1.
// - R11 is stored to ( A11 A12 ).
// - W12 stores T and partial updates for FLA_Apply_Q_UT_piv_var.
FLA_QR_UT_piv_internal( ABR, TT, wB, p1,
FLA_Cntl_sub_qrut( cntl ) );
if ( FLA_Obj_width( A12 ) > 0 )
{
// ** Block update
FLA_Part_2x1( W12, &TT,
&TB, b, FLA_TOP );
FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
FLA_MINUS_ONE, A21, TT, FLA_ONE, A22 );
}
// ** Apply pivots to previous columns.
FLA_Apply_pivots( FLA_RIGHT, FLA_TRANSPOSE, p1, ATR );
/*------------------------------------------------------------*/
FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02,
A10, A11, /**/ A12,
/* ************** */ /* ****************** */
&ABL, /**/ &ABR, A20, A21, /**/ A22,
FLA_TL );
FLA_Cont_with_1x3_to_1x2( &TL, /**/ &TR, T0, T1, /**/ W12,
FLA_LEFT );
FLA_Cont_with_3x1_to_2x1( &pT, p0,
//.........这里部分代码省略.........
示例2: FLA_Her2k_ln_blk_var4
FLA_Error FLA_Her2k_ln_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl )
{
FLA_Obj AT, A0,
AB, A1,
A2;
FLA_Obj BT, B0,
BB, B1,
B2;
FLA_Obj CTL, CTR, C00, C01, C02,
CBL, CBR, C10, C11, C12,
C20, C21, C22;
dim_t b;
FLA_Scalr_internal( FLA_LOWER_TRIANGULAR, beta, C,
FLA_Cntl_sub_scalr( cntl ) );
FLA_Part_2x1( A, &AT,
&AB, 0, FLA_TOP );
FLA_Part_2x1( B, &BT,
&BB, 0, FLA_TOP );
FLA_Part_2x2( C, &CTL, &CTR,
&CBL, &CBR, 0, 0, FLA_TL );
while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){
b = FLA_Determine_blocksize( AB, FLA_BOTTOM, FLA_Cntl_blocksize( cntl ) );
FLA_Repart_2x1_to_3x1( AT, &A0,
/* ** */ /* ** */
&A1,
AB, &A2, b, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( BT, &B0,
/* ** */ /* ** */
&B1,
BB, &B2, b, FLA_BOTTOM );
FLA_Repart_2x2_to_3x3( CTL, /**/ CTR, &C00, /**/ &C01, &C02,
/* ************* */ /* ******************** */
&C10, /**/ &C11, &C12,
CBL, /**/ CBR, &C20, /**/ &C21, &C22,
b, b, FLA_BR );
/*------------------------------------------------------------*/
/* C21 = C21 + A2 * B1' */
FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
alpha, A2, B1, FLA_ONE, C21,
FLA_Cntl_sub_gemm1( cntl ) );
/* C21 = C21 + B2 * A1' */
FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
alpha, B2, A1, FLA_ONE, C21,
FLA_Cntl_sub_gemm2( cntl ) );
/* C11 = C11 + A1 * B1' + B1 * A1' */
FLA_Her2k_internal( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE,
alpha, A1, B1, FLA_ONE, C11,
FLA_Cntl_sub_her2k( cntl ) );
/*------------------------------------------------------------*/
FLA_Cont_with_3x1_to_2x1( &AT, A0,
A1,
/* ** */ /* ** */
&AB, A2, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &BT, B0,
B1,
/* ** */ /* ** */
&BB, B2, FLA_TOP );
FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR, C00, C01, /**/ C02,
C10, C11, /**/ C12,
/* ************** */ /* ****************** */
&CBL, /**/ &CBR, C20, C21, /**/ C22,
FLA_TL );
}
return FLA_SUCCESS;
}
示例3: FLA_Syrk_ln_omp1t_var3
FLA_Error FLA_Syrk_ln_omp1t_var3( FLA_Obj A, FLA_Obj C )
{
FLA_Obj AT, A0,
AB, A1,
A2;
FLA_Obj CTL, CTR, C00, C01, C02,
CBL, CBR, C10, C11, C12,
C20, C21, C22;
int b;
FLA_Part_2x1( A, &AT,
&AB, 0, FLA_BOTTOM );
FLA_Part_2x2( C, &CTL, &CTR,
&CBL, &CBR, 0, 0, FLA_BR );
#pragma intel omp parallel taskq
{
while ( FLA_Obj_length( AB ) < FLA_Obj_length( A ) ){
b = FLA_Task_compute_blocksize( 0, A, AB, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( AT, &A0,
&A1,
/* ** */ /* ** */
AB, &A2, b, FLA_TOP );
FLA_Repart_2x2_to_3x3( CTL, /**/ CTR, &C00, &C01, /**/ &C02,
&C10, &C11, /**/ &C12,
/* ************* */ /* ******************** */
CBL, /**/ CBR, &C20, &C21, /**/ &C22,
b, b, FLA_TL );
/*------------------------------------------------------------*/
#pragma intel omp task captureprivate(C11, A1, A2, C21)
{
/* C21 = C21 + A2 * A1' */
FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_TRANSPOSE, FLA_ONE, A2, A1, FLA_ONE, C21 );
/* C11 = C11 + A1 * A1' */
FLA_Syrk_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_ONE, A1, FLA_ONE, C11 );
}
/*------------------------------------------------------------*/
FLA_Cont_with_3x1_to_2x1( &AT, A0,
/* ** */ /* ** */
A1,
&AB, A2, FLA_BOTTOM );
FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR, C00, /**/ C01, C02,
/* ************** */ /* ****************** */
C10, /**/ C11, C12,
&CBL, /**/ &CBR, C20, /**/ C21, C22,
FLA_BR );
}
}
return FLA_SUCCESS;
}
示例4: FLA_Trmm_llt_blk_var1
FLA_Error FLA_Trmm_llt_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl )
{
FLA_Obj ATL, ATR, A00, A01, A02,
ABL, ABR, A10, A11, A12,
A20, A21, A22;
FLA_Obj BT, B0,
BB, B1,
B2;
dim_t b;
FLA_Part_2x2( A, &ATL, &ATR,
&ABL, &ABR, 0, 0, FLA_TL );
FLA_Part_2x1( B, &BT,
&BB, 0, FLA_TOP );
while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){
b = FLA_Determine_blocksize( ABR, FLA_BR, FLA_Cntl_blocksize( cntl ) );
FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02,
/* ************* */ /* ******************** */
&A10, /**/ &A11, &A12,
ABL, /**/ ABR, &A20, /**/ &A21, &A22,
b, b, FLA_BR );
FLA_Repart_2x1_to_3x1( BT, &B0,
/* ** */ /* ** */
&B1,
BB, &B2, b, FLA_BOTTOM );
/*------------------------------------------------------------*/
/* B1 = tril( A11 )' * B1; */
FLA_Trmm_internal( FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_TRANSPOSE, diagA,
alpha, A11, B1,
FLA_Cntl_sub_trmm( cntl ) );
/* B1 = B1 + A21' * B2; */
FLA_Gemm_internal( FLA_TRANSPOSE, FLA_NO_TRANSPOSE,
alpha, A21, B2, FLA_ONE, B1,
FLA_Cntl_sub_gemm( cntl ) );
/*------------------------------------------------------------*/
FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02,
A10, A11, /**/ A12,
/* ************** */ /* ****************** */
&ABL, /**/ &ABR, A20, A21, /**/ A22,
FLA_TL );
FLA_Cont_with_3x1_to_2x1( &BT, B0,
B1,
/* ** */ /* ** */
&BB, B2, FLA_TOP );
}
return FLA_SUCCESS;
}
示例5: FLA_Symm_lu_unb_var1
FLA_Error FLA_Symm_lu_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
FLA_Obj ATL, ATR, A00, a01, A02,
ABL, ABR, a10t, alpha11, a12t,
A20, a21, A22;
FLA_Obj BT, B0,
BB, b1t,
B2;
FLA_Obj CT, C0,
CB, c1t,
C2;
FLA_Scal_external( beta, C );
FLA_Part_2x2( A, &ATL, &ATR,
&ABL, &ABR, 0, 0, FLA_TL );
FLA_Part_2x1( B, &BT,
&BB, 0, FLA_TOP );
FLA_Part_2x1( C, &CT,
&CB, 0, FLA_TOP );
while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){
FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &a01, &A02,
/* ************* */ /* ************************** */
&a10t, /**/ &alpha11, &a12t,
ABL, /**/ ABR, &A20, /**/ &a21, &A22,
1, 1, FLA_BR );
FLA_Repart_2x1_to_3x1( BT, &B0,
/* ** */ /* ** */
&b1t,
BB, &B2, 1, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( CT, &C0,
/* ** */ /* ** */
&c1t,
CB, &C2, 1, FLA_BOTTOM );
/*------------------------------------------------------------*/
/* C0 = C0 + a01 * b1t */
FLA_Ger_external( alpha, a01, b1t, C0 );
/* c1t = c1t + a01' * B0 */
/* c1t' = c1t' + B0' * a01 */
FLA_Gemv_external( FLA_TRANSPOSE, alpha, B0, a01, FLA_ONE, c1t );
/* c1t = c1t + alpha11 * b1t */
FLA_Axpys_external( alpha, alpha11, b1t, FLA_ONE, c1t );
/*------------------------------------------------------------*/
FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, a01, /**/ A02,
a10t, alpha11, /**/ a12t,
/* ************** */ /* ************************ */
&ABL, /**/ &ABR, A20, a21, /**/ A22,
FLA_TL );
FLA_Cont_with_3x1_to_2x1( &BT, B0,
b1t,
/* ** */ /* ** */
&BB, B2, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &CT, C0,
c1t,
/* ** */ /* ** */
&CB, C2, FLA_TOP );
}
return FLA_SUCCESS;
}
示例6: FLA_Gemm_nn_omp_var15
//.........这里部分代码省略.........
{
while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) )
{
b_m = FLA_Determine_blocksize( A, AT, FLA_TOP, FLA_Cntl_blocksize( cntl ) );
FLA_Repart_2x1_to_3x1( AT, &A0,
/* ** */ /* ** */
&A1,
AB, &A2, b_m, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( CT, &C0,
/* ** */ /* ** */
&C1,
CB, &C2, b_m, FLA_BOTTOM );
/*------------------------------------------------------------*/
/* C1 = alpha * A1 * B + C1; */
FLA_Part_1x2( A1, &AL, &AR, 0, FLA_LEFT );
FLA_Part_2x1( B, &BT,
&BB, 0, FLA_TOP );
while ( FLA_Obj_width( AL ) < FLA_Obj_width( A ) )
{
b_k = FLA_Determine_blocksize( A, AL, FLA_LEFT, FLA_Cntl_blocksize( cntl ) );
// Get the index of the current partition.
// FIX THIS: need + b_m - 1 or something like this
//j = FLA_Obj_length( CT ) / b_m;
//i = FLA_Obj_width( AL ) / b_k;
//lock_ldim = FLA_get_num_threads_in_m_dim(omp_get_num_threads());
lock_i = FLA_Obj_length( CT ) / b_m;
FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A10, /**/ &A11, &A12,
b_k, FLA_RIGHT );
FLA_Repart_2x1_to_3x1( BT, &B0,
/* ** */ /* ** */
&B1,
BB, &B2, b_k, FLA_BOTTOM );
/*------------------------------------------------------------*/
/* C1 = alpha * A11 * B1 + C1; */
//// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
//// alpha, A11, B1, FLA_ONE, C1 );
#pragma intel omp task captureprivate( lock_i, A11, B1, C1 ), private( C1_local )
{
FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, C1, &C1_local );
FLA_Obj_set_to_zero( C1_local );
/* C1_local = alpha * A1 * B11 + C1_local; */
FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
alpha, A11, B1, FLA_ONE, C1_local );
// Acquire lock[i] (the lock for C1).
omp_set_lock( &fla_omp_lock[lock_i] );
/* C1 += C1_local */
FLA_Axpy_external( FLA_ONE, C1_local, C1 );
//FLA_Axpy_sync_pipeline2( j*lock_ldim, FLA_ONE, C1_local, C1 );
//FLA_Axpy_sync_circular2( j*lock_ldim, i, FLA_ONE, C1_local, C1 );
//REF_Axpy_sync_circular2( j*lock_ldim, i, FLA_ONE, C1_local, C1 );
// Release lock[i] (the lock for C1).
omp_unset_lock( &fla_omp_lock[lock_i] );
FLA_Obj_free( &C1_local );
}
/*------------------------------------------------------------*/
FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A10, A11, /**/ A12,
FLA_LEFT );
FLA_Cont_with_3x1_to_2x1( &BT, B0,
B1,
/* ** */ /* ** */
&BB, B2, FLA_TOP );
}
/*------------------------------------------------------------*/
FLA_Cont_with_3x1_to_2x1( &AT, A0,
A1,
/* ** */ /* ** */
&AB, A2, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &CT, C0,
C1,
/* ** */ /* ** */
&CB, C2, FLA_TOP );
}
}
return FLA_SUCCESS;
}
示例7: FLA_LU_piv_unb_var3
FLA_Error FLA_LU_piv_unb_var3( FLA_Obj A, FLA_Obj p )
{
FLA_Obj ATL, ATR, A00, a01, A02,
ABL, ABR, a10t, alpha11, a12t,
A20, a21, A22;
FLA_Obj AL, AR, A0, a1, A2;
FLA_Obj pT, p0,
pB, pi1,
p2;
FLA_Obj AB0, aB1;
FLA_Part_2x2( A, &ATL, &ATR,
&ABL, &ABR, 0, 0, FLA_TL );
FLA_Part_1x2( A, &AL, &AR, 0, FLA_LEFT );
FLA_Part_2x1( p, &pT,
&pB, 0, FLA_TOP );
while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) &&
FLA_Obj_width( ATL ) < FLA_Obj_width( A )){
FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &a01, &A02,
/* ************* */ /* ************************** */
&a10t, /**/ &alpha11, &a12t,
ABL, /**/ ABR, &A20, /**/ &a21, &A22,
1, 1, FLA_BR );
FLA_Repart_1x2_to_1x3( AL, /**/ AR, &A0, /**/ &a1, &A2,
1, FLA_RIGHT );
FLA_Repart_2x1_to_3x1( pT, &p0,
/* ** */ /* *** */
&pi1,
pB, &p2, 1, FLA_BOTTOM );
/*------------------------------------------------------------*/
// Apply previously computed pivots
FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, p0, a1 );
// a01 = trilu( A00 ) \ a01
FLA_Trsv_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, A00, a01 );
// alpha11 = alpha11 - a10t * a01
FLA_Dots_external( FLA_MINUS_ONE, a10t, a01, FLA_ONE, alpha11 );
// a21 = a21 - A20 * a01
FLA_Gemv_external( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, A20, a01, FLA_ONE, a21 );
// aB1 = / alpha11 \
// \ a21 /
FLA_Merge_2x1( alpha11,
a21, &aB1 );
// Determine pivot index
FLA_Amax_external( aB1, pi1 );
// Apply pivots to current column
FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, pi1, aB1 );
// a21 = a21 / alpha11
FLA_Inv_scal_external( alpha11, a21 );
// AB0 = / a10t \
// \ A20 /
FLA_Merge_2x1( a10t,
A20, &AB0 );
// Apply pivots to previous columns
FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, pi1, AB0 );
/*------------------------------------------------------------*/
FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, a01, /**/ A02,
a10t, alpha11, /**/ a12t,
/* ************** */ /* ************************ */
&ABL, /**/ &ABR, A20, a21, /**/ A22,
FLA_TL );
FLA_Cont_with_1x3_to_1x2( &AL, /**/ &AR, A0, a1, /**/ A2,
FLA_LEFT );
FLA_Cont_with_3x1_to_2x1( &pT, p0,
pi1,
/* ** */ /* *** */
&pB, p2, FLA_TOP );
}
if ( FLA_Obj_width( ATR ) > 0 )
{
/* Apply pivots to untouched columns */
FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE, p, ATR );
/* ATR = trilu( ATL ) \ ATR */
//.........这里部分代码省略.........
示例8: FLA_Apply_QUD_UT_lhfc_blk_var1
//.........这里部分代码省略.........
FLA_Obj T1T,
T1B;
FLA_Obj W1TL, W1TR,
W1BL, W1BR;
dim_t b_alg, b;
// Query the algorithmic blocksize by inspecting the length of T.
b_alg = FLA_Obj_length( T );
FLA_Part_1x2( T, &TL, &TR, 0, FLA_LEFT );
FLA_Part_1x2( U, &UL, &UR, 0, FLA_LEFT );
FLA_Part_1x2( V, &VL, &VR, 0, FLA_LEFT );
FLA_Part_2x1( R, &RT,
&RB, 0, FLA_TOP );
while ( FLA_Obj_width( UL ) < FLA_Obj_width( U ) ){
b = min( b_alg, FLA_Obj_width( UR ) );
FLA_Repart_1x2_to_1x3( TL, /**/ TR, &T0, /**/ &T1, &T2,
b, FLA_RIGHT );
FLA_Repart_1x2_to_1x3( UL, /**/ UR, &U0, /**/ &U1, &U2,
b, FLA_RIGHT );
FLA_Repart_1x2_to_1x3( VL, /**/ VR, &V0, /**/ &V1, &V2,
b, FLA_RIGHT );
FLA_Repart_2x1_to_3x1( RT, &R0,
/* ** */ /* ** */
&R1,
RB, &R2, b, FLA_BOTTOM );
/*------------------------------------------------------------*/
FLA_Part_2x1( T1, &T1T,
&T1B, b, FLA_TOP );
FLA_Part_2x2( W, &W1TL, &W1TR,
&W1BL, &W1BR, b, FLA_Obj_width( R1 ), FLA_TL );
// W1TL = R1;
FLA_Copyt_internal( FLA_NO_TRANSPOSE, R1, W1TL,
FLA_Cntl_sub_copyt( cntl ) );
// W1TL = inv( triu( T1T ) )' * ( R1 + U1' * C + V1' * D );
FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE,
FLA_ONE, U1, C, FLA_ONE, W1TL,
FLA_Cntl_sub_gemm1( cntl ) );
FLA_Gemm_internal( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE,
FLA_ONE, V1, D, FLA_ONE, W1TL,
FLA_Cntl_sub_gemm2( cntl ) );
FLA_Trsm_internal( FLA_LEFT, FLA_UPPER_TRIANGULAR,
FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
FLA_ONE, T1T, W1TL,
FLA_Cntl_sub_trsm( cntl ) );
// R1 = R1 - W1TL;
// C = C - U1 * W1TL;
// D = D + V1 * W1TL;
FLA_Axpyt_internal( FLA_NO_TRANSPOSE, FLA_MINUS_ONE, W1TL, R1,
FLA_Cntl_sub_axpyt( cntl ) );
FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
FLA_MINUS_ONE, U1, W1TL, FLA_ONE, C,
FLA_Cntl_sub_gemm3( cntl ) );
FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
FLA_ONE, V1, W1TL, FLA_ONE, D,
FLA_Cntl_sub_gemm4( cntl ) );
/*------------------------------------------------------------*/
FLA_Cont_with_1x3_to_1x2( &TL, /**/ &TR, T0, T1, /**/ T2,
FLA_LEFT );
FLA_Cont_with_1x3_to_1x2( &UL, /**/ &UR, U0, U1, /**/ U2,
FLA_LEFT );
FLA_Cont_with_1x3_to_1x2( &VL, /**/ &VR, V0, V1, /**/ V2,
FLA_LEFT );
FLA_Cont_with_3x1_to_2x1( &RT, R0,
R1,
/* ** */ /* ** */
&RB, R2, FLA_TOP );
}
return FLA_SUCCESS;
}
示例9: Symm_blk_var8
int Symm_blk_var8( FLA_Obj A, FLA_Obj B, FLA_Obj C, int nb_alg )
{
FLA_Obj ATL, ATR, A00, A01, A02,
ABL, ABR, A10, A11, A12,
A20, A21, A22;
FLA_Obj BT, B0,
BB, B1,
B2;
FLA_Obj CT, C0,
CB, C1,
C2;
int b;
FLA_Part_2x2( A, &ATL, &ATR,
&ABL, &ABR, 0, 0, FLA_TL );
FLA_Part_2x1( B, &BT,
&BB, 0, FLA_TOP );
FLA_Part_2x1( C, &CT,
&CB, 0, FLA_TOP );
while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){
b = min( FLA_Obj_length( ABR ), nb_alg );
FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02,
/* ************* */ /* ******************** */
&A10, /**/ &A11, &A12,
ABL, /**/ ABR, &A20, /**/ &A21, &A22,
b, b, FLA_BR );
FLA_Repart_2x1_to_3x1( BT, &B0,
/* ** */ /* ** */
&B1,
BB, &B2, b, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( CT, &C0,
/* ** */ /* ** */
&C1,
CB, &C2, b, FLA_BOTTOM );
/*------------------------------------------------------------*/
// C0 = C0 + A10^T*B1;
FLA_Gemm(FLA_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A10, B1, FLA_ONE, C0);
// C1 = C1 + A10*B0 + A11*B1;
FLA_Gemm(FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A10, B0, FLA_ONE, C1);
// FLA_Gemm(FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, FLA_ONE, A11, B1, FLA_ONE, C1);
FLA_Symm(FLA_LEFT, FLA_LOWER_TRIANGULAR, FLA_ONE, A11, B1, FLA_ONE, C1);
/*------------------------------------------------------------*/
FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02,
A10, A11, /**/ A12,
/* ************** */ /* ****************** */
&ABL, /**/ &ABR, A20, A21, /**/ A22,
FLA_TL );
FLA_Cont_with_3x1_to_2x1( &BT, B0,
B1,
/* ** */ /* ** */
&BB, B2, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &CT, C0,
C1,
/* ** */ /* ** */
&CB, C2, FLA_TOP );
}
return FLA_SUCCESS;
}
示例10: FLA_Gemm_pp_nn_var1
FLA_Error FLA_Gemm_pp_nn_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj C, int nb_alg )
{
FLA_Obj AT, A0,
AB, A1,
A2;
FLA_Obj CT, C0,
CB, C1,
C2;
FLA_Obj packed_C1;
int b;
FLA_Part_2x1( A, &AT,
&AB, 0, FLA_TOP );
FLA_Part_2x1( C, &CT,
&CB, 0, FLA_TOP );
/* Initialize the FLA_Gemm() interface to the kernel environment
Note: the blocksize given to the kernel environment can be non-
square. We pass the m and n dimensions of the blocksize here. */
FLA_Gemm_init( nb_alg, FLA_Obj_width( A ) );
/* Pack B */
/* Note: the idea here is that, optionally,
- B is packed, and/or
- B is scaled
If B needs not be packed, it is not packed. If the multiplication
by alpha happens elsewhere, no scaling occurs.
The "NoTranspose" means that in the version of gemm being updated
B is not transposed. In packing, B could be transposed, if there
is an advantage to this. So, the "NoTranspose" means that input
B is not transposed in the FLA_Gemm call. */
FLA_Gemm_pack_andor_scale_B( FLA_NO_TRANSPOSE, alpha, B );
while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ){
b = min( FLA_Obj_length( AB ), nb_alg );
FLA_Repart_2x1_to_3x1( AT, &A0,
/* ** */ /* ** */
&A1,
AB, &A2, b, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( CT, &C0,
/* ** */ /* ** */
&C1,
CB, &C2, b, FLA_BOTTOM );
/*------------------------------------------------------------*/
/* C1 = alpha * A1 * B + C1; */
/* Pack C1 */
/* Note: the idea here is that, optionally,
- packed space is provided for computing packed_C1 = alpha * A1 * B
If C1 needs not be packed, then C can just be returned by this routine. */
FLA_Gemm_pack_C( FLA_NO_TRANSPOSE, C1 );
/* Pack A */
/* Note: the idea here is that, optionally,
- A is packed, and/or
- A is scaled
If A needs not be packed, it is not packed. If the multiplication
by alpha happens elsewhere, no scaling occurs. */
FLA_Gemm_pack_andor_scale_A( FLA_NO_TRANSPOSE, alpha, A1 );
/* Call the kernel routine */
FLA_Gemm_kernel( alpha, A1, B, C1 );
/* Unpack C1 */
/* Note: the idea here is that, optionally,
- packed_C1 is added to C1, possibly scaled at this point. */
FLA_Gemm_unpack_andor_scale_C( FLA_NO_TRANSPOSE, alpha, C1 );
/*------------------------------------------------------------*/
FLA_Cont_with_3x1_to_2x1( &AT, A0,
A1,
/* ** */ /* ** */
&AB, A2, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &CT, C0,
C1,
/* ** */ /* ** */
&CB, C2, FLA_TOP );
}
/* Release the space used to pack A1 */
/* Note: notice that the space provided for A1 can be recycled
everytime through the loop, which is why this call is outside the loop.
If the space is statically allocated, or A1 was not packed,
this could be a no-op. */
FLA_Gemm_release_pack_A( FLA_NO_TRANSPOSE, A1 );
//.........这里部分代码省略.........
示例11: FLA_Syr2k_un_unb_var4
FLA_Error FLA_Syr2k_un_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C )
{
FLA_Obj AT, A0,
AB, a1t,
A2;
FLA_Obj BT, B0,
BB, b1t,
B2;
FLA_Obj CTL, CTR, C00, c01, C02,
CBL, CBR, c10t, gamma11, c12t,
C20, c21, C22;
FLA_Scalr_external( FLA_UPPER_TRIANGULAR, beta, C );
FLA_Part_2x1( A, &AT,
&AB, 0, FLA_TOP );
FLA_Part_2x1( B, &BT,
&BB, 0, FLA_TOP );
FLA_Part_2x2( C, &CTL, &CTR,
&CBL, &CBR, 0, 0, FLA_TL );
while ( FLA_Obj_length( AT ) < FLA_Obj_length( A ) ) {
FLA_Repart_2x1_to_3x1( AT, &A0,
/* ** */ /* ** */
&a1t,
AB, &A2, 1, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( BT, &B0,
/* ** */ /* ** */
&b1t,
BB, &B2, 1, FLA_BOTTOM );
FLA_Repart_2x2_to_3x3( CTL, /**/ CTR, &C00, /**/ &c01, &C02,
/* ************* */ /* ************************** */
&c10t, /**/ &gamma11, &c12t,
CBL, /**/ CBR, &C20, /**/ &c21, &C22,
1, 1, FLA_BR );
/*------------------------------------------------------------*/
/* c01 = c01 + A0 * b1t' */
FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, A0, b1t, FLA_ONE, c01 );
/* c01 = c01 + B0 * a1t' */
FLA_Gemv_external( FLA_NO_TRANSPOSE, alpha, B0, a1t, FLA_ONE, c01 );
/* gamma11 = gamma11 + a1t * b1t' + b1t * a1t' */
FLA_Dot2s_external( alpha, a1t, b1t, FLA_ONE, gamma11 );
/*------------------------------------------------------------*/
FLA_Cont_with_3x1_to_2x1( &AT, A0,
a1t,
/* ** */ /* ** */
&AB, A2, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &BT, B0,
b1t,
/* ** */ /* ** */
&BB, B2, FLA_TOP );
FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR, C00, c01, /**/ C02,
c10t, gamma11, /**/ c12t,
/* ************** */ /* ************************ */
&CBL, /**/ &CBR, C20, c21, /**/ C22,
FLA_TL );
}
return FLA_SUCCESS;
}
示例12: FLASH_Axpy_hierarchy
FLA_Error FLASH_Axpy_hierarchy( int direction, FLA_Obj alpha, FLA_Obj F, FLA_Obj* H )
{
// Once we get down to a submatrix whose elements are scalars, we are down
// to our base case.
if ( FLA_Obj_elemtype( *H ) == FLA_SCALAR )
{
// Depending on which top-level function invoked us, we either axpy
// the source data in the flat matrix to the leaf-level submatrix of
// the hierarchical matrix, or axpy the data in the hierarchical
// submatrix to the flat matrix.
if ( direction == FLA_FLAT_TO_HIER )
{
#ifdef FLA_ENABLE_SCC
if ( FLA_is_owner() )
#endif
FLA_Axpy_external( alpha, F, *H );
}
else if ( direction == FLA_HIER_TO_FLAT )
{
#ifdef FLA_ENABLE_SCC
if ( FLA_is_owner() )
#endif
FLA_Axpy_external( alpha, *H, F );
}
}
else
{
FLA_Obj HL, HR, H0, H1, H2;
FLA_Obj FL, FR, F0, F1, F2;
FLA_Obj H1T, H01,
H1B, H11,
H21;
FLA_Obj F1T, F01,
F1B, F11,
F21;
dim_t b_m;
dim_t b_n;
FLA_Part_1x2( *H, &HL, &HR, 0, FLA_LEFT );
FLA_Part_1x2( F, &FL, &FR, 0, FLA_LEFT );
while ( FLA_Obj_width( HL ) < FLA_Obj_width( *H ) )
{
FLA_Repart_1x2_to_1x3( HL, /**/ HR, &H0, /**/ &H1, &H2,
1, FLA_RIGHT );
// Get the scalar width of H1 and use that to determine the
// width of F1.
b_n = FLASH_Obj_scalar_width( H1 );
FLA_Repart_1x2_to_1x3( FL, /**/ FR, &F0, /**/ &F1, &F2,
b_n, FLA_RIGHT );
// -------------------------------------------------------------
FLA_Part_2x1( H1, &H1T,
&H1B, 0, FLA_TOP );
FLA_Part_2x1( F1, &F1T,
&F1B, 0, FLA_TOP );
while ( FLA_Obj_length( H1T ) < FLA_Obj_length( H1 ) )
{
FLA_Repart_2x1_to_3x1( H1T, &H01,
/* ** */ /* *** */
&H11,
H1B, &H21, 1, FLA_BOTTOM );
// Get the scalar length of H11 and use that to determine the
// length of F11.
b_m = FLASH_Obj_scalar_length( H11 );
FLA_Repart_2x1_to_3x1( F1T, &F01,
/* ** */ /* *** */
&F11,
F1B, &F21, b_m, FLA_BOTTOM );
// -------------------------------------------------------------
// Recursively axpy between F11 and H11.
FLASH_Axpy_hierarchy( direction, alpha, F11,
FLASH_OBJ_PTR_AT( H11 ) );
// -------------------------------------------------------------
FLA_Cont_with_3x1_to_2x1( &H1T, H01,
H11,
/* ** */ /* *** */
&H1B, H21, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &F1T, F01,
F11,
/* ** */ /* *** */
&F1B, F21, FLA_TOP );
}
// -------------------------------------------------------------
FLA_Cont_with_1x3_to_1x2( &HL, /**/ &HR, H0, H1, /**/ H2,
FLA_LEFT );
FLA_Cont_with_1x3_to_1x2( &FL, /**/ &FR, F0, F1, /**/ F2,
//.........这里部分代码省略.........
示例13: FLA_Sylv_nh_blk_var16
FLA_Error FLA_Sylv_nh_blk_var16( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl )
{
FLA_Obj ATL, ATR, A00, A01, A02,
ABL, ABR, A10, A11, A12,
A20, A21, A22;
FLA_Obj CT, C0,
CB, C1,
C2;
dim_t b;
FLA_Part_2x2( A, &ATL, &ATR,
&ABL, &ABR, 0, 0, FLA_BR );
FLA_Part_2x1( C, &CT,
&CB, 0, FLA_BOTTOM );
while ( FLA_Obj_length( ABR ) < FLA_Obj_length( A ) ){
b = FLA_Determine_blocksize( CT, FLA_TOP, FLA_Cntl_blocksize( cntl ) );
FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, &A01, /**/ &A02,
&A10, &A11, /**/ &A12,
/* ************* */ /* ******************** */
ABL, /**/ ABR, &A20, &A21, /**/ &A22,
b, b, FLA_TL );
FLA_Repart_2x1_to_3x1( CT, &C0,
&C1,
/* ** */ /* ** */
CB, &C2, b, FLA_TOP );
// Loop Invariant:
// CT = CT - ATR * sylv( ABR, B', CB )
// CB = sylv( ABR, B', CB )
/*------------------------------------------------------------*/
// C1 = sylv( A11, B', C1 );
FLA_Sylv_internal( FLA_NO_TRANSPOSE, FLA_CONJ_TRANSPOSE,
isgn, A11, B, C1, scale,
FLA_Cntl_sub_sylv1( cntl ) );
// C0 = C0 - A01 * C1;
FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
FLA_MINUS_ONE, A01, C1, FLA_ONE, C0,
FLA_Cntl_sub_gemm1( cntl ) );
/*------------------------------------------------------------*/
FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, /**/ A01, A02,
/* ************** */ /* ****************** */
A10, /**/ A11, A12,
&ABL, /**/ &ABR, A20, /**/ A21, A22,
FLA_BR );
FLA_Cont_with_3x1_to_2x1( &CT, C0,
/* ** */ /* ** */
C1,
&CB, C2, FLA_BOTTOM );
}
return FLA_SUCCESS;
}
示例14: Symm_blk_var1
int Symm_blk_var1( FLA_Obj A, FLA_Obj B, FLA_Obj C, int nb_alg )
{
FLA_Obj ATL, ATR, A00, A01, A02,
ABL, ABR, A10, A11, A12,
A20, A21, A22;
FLA_Obj BT, B0,
BB, B1,
B2;
FLA_Obj CT, C0,
CB, C1,
C2;
int b;
FLA_Part_2x2( A, &ATL, &ATR,
&ABL, &ABR, 0, 0, FLA_TL );
FLA_Part_2x1( B, &BT,
&BB, 0, FLA_TOP );
FLA_Part_2x1( C, &CT,
&CB, 0, FLA_TOP );
while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) ){
b = min( FLA_Obj_length( ABR ), nb_alg );
FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02,
/* ************* */ /* ******************** */
&A10, /**/ &A11, &A12,
ABL, /**/ ABR, &A20, /**/ &A21, &A22,
b, b, FLA_BR );
FLA_Repart_2x1_to_3x1( BT, &B0,
/* ** */ /* ** */
&B1,
BB, &B2, b, FLA_BOTTOM );
FLA_Repart_2x1_to_3x1( CT, &C0,
/* ** */ /* ** */
&C1,
CB, &C2, b, FLA_BOTTOM );
/*------------------------------------------------------------*/
C1 = C1 + A10*B0 + A11*B1;
C0 = C0 + A10*B1;
/* update line 1 */
/* : */
/* update line n */
/*------------------------------------------------------------*/
FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02,
A10, A11, /**/ A12,
/* ************** */ /* ****************** */
&ABL, /**/ &ABR, A20, A21, /**/ A22,
FLA_TL );
FLA_Cont_with_3x1_to_2x1( &BT, B0,
B1,
/* ** */ /* ** */
&BB, B2, FLA_TOP );
FLA_Cont_with_3x1_to_2x1( &CT, C0,
C1,
/* ** */ /* ** */
&CB, C2, FLA_TOP );
}
return FLA_SUCCESS;
}
示例15: FLA_Syrk_un_blk_var3
FLA_Error FLA_Syrk_un_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl )
{
FLA_Obj AT, A0,
AB, A1,
A2;
FLA_Obj CTL, CTR, C00, C01, C02,
CBL, CBR, C10, C11, C12,
C20, C21, C22;
dim_t b;
FLA_Part_2x1( A, &AT,
&AB, 0, FLA_BOTTOM );
FLA_Part_2x2( C, &CTL, &CTR,
&CBL, &CBR, 0, 0, FLA_BR );
while ( FLA_Obj_length( AB ) < FLA_Obj_length( A ) ){
b = FLA_Determine_blocksize( AT, FLA_TOP, FLA_Cntl_blocksize( cntl ) );
FLA_Repart_2x1_to_3x1( AT, &A0,
&A1,
/* ** */ /* ** */
AB, &A2, b, FLA_TOP );
FLA_Repart_2x2_to_3x3( CTL, /**/ CTR, &C00, &C01, /**/ &C02,
&C10, &C11, /**/ &C12,
/* ************* */ /* ******************** */
CBL, /**/ CBR, &C20, &C21, /**/ &C22,
b, b, FLA_TL );
/*------------------------------------------------------------*/
/* C12 = C12 + A1 * A2' */
FLA_Gemm_internal( FLA_NO_TRANSPOSE, FLA_TRANSPOSE,
alpha, A1, A2, beta, C12,
FLA_Cntl_sub_gemm( cntl ) );
/* C11 = C11 + A1 * A1' */
FLA_Syrk_internal( FLA_UPPER_TRIANGULAR, FLA_NO_TRANSPOSE,
alpha, A1, beta, C11,
FLA_Cntl_sub_syrk( cntl ) );
/*------------------------------------------------------------*/
FLA_Cont_with_3x1_to_2x1( &AT, A0,
/* ** */ /* ** */
A1,
&AB, A2, FLA_BOTTOM );
FLA_Cont_with_3x3_to_2x2( &CTL, /**/ &CTR, C00, /**/ C01, C02,
/* ************** */ /* ****************** */
C10, /**/ C11, C12,
&CBL, /**/ &CBR, C20, /**/ C21, C22,
FLA_BR );
}
return FLA_SUCCESS;
}