/**
 *  \file src/MBCSR/MatTrisolve/8x7.c
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of sparse triangular solve.
 *  \ingroup MATTYPE_MBCSR
 *
 *  Automatically generated by ./gen_trisolve.sh
 *  on Wed Jun  8 16:10:08 PDT 2005.
 */

#include <assert.h>

#include <oski/config.h> /* for 'restrict' keyword */
#include <oski/common.h>
#include <oski/matrix.h>
#include <oski/blas_names.h>
#include <oski/MBCSR/format.h>
#include <oski/MBCSR/module.h>

/*
 * REGISTER is a storage-class prefix for kernel temporaries declared
 * with it: it expands to nothing when IS_VAL_COMPLEX is nonzero and to
 * the 'register' keyword otherwise.
 */
#if IS_VAL_COMPLEX
	/** Complex-valued, so do not use explicit 'register' keyword. */
	#define REGISTER
#else
	/** Real-valued, so use explicit 'register' keyword. */
	#define REGISTER register
#endif

/*
 * When DO_NAME_MANGLING is defined, every later use of the generic
 * kernel name in this file (including the function definition that
 * follows) is replaced by MANGLE_MOD_() applied to the name with an
 * "_8x7" suffix. MANGLE_MOD_ is defined elsewhere; presumably it adds
 * a module- or type-specific prefix (confirm in the OSKI headers).
 */
#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatTrisolve_Lower_v1_aX_xs1. */
	#define MBCSR_MatTrisolve_Lower_v1_aX_xs1 \
		MANGLE_MOD_(MBCSR_MatTrisolve_Lower_v1_aX_xs1_8x7)
#endif

/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$L^{-1}\cdot b\f$, where x has unit-stride.
 *
 *  Block rows are processed from first to last. For each block row,
 *  the 8 entries of x belonging to it are scaled by alpha, reduced by
 *  the products of the row's off-diagonal \f$8\times 7\f$ blocks with
 *  the entries of x those blocks reference, and finally solved against
 *  the row's \f$8\times 8\f$ diagonal block by forward substitution.
 *
 *  \param[in] M Number of block rows.
 *  \param[in] d0 Offset into x of the first entry to solve for.
 *  \param[in] ptr Block-row pointers: the off-diagonal blocks of
 *  (0-based) block row I occupy positions ptr[I] .. ptr[I+1]-1.
 *  \param[in] ind For each off-diagonal block, the index into x of
 *  the first of the 7 entries that the block multiplies.
 *  \param[in] val Off-diagonal blocks, 8*7 values each; entry (r, c)
 *  of a block is at offset r*7 + c.
 *  \param[in] diag Diagonal blocks, 8*8 values per block row; entry
 *  (r, c) is at offset r*8 + c and only entries with c <= r are read.
 *  \param[in] alpha Scalar applied to the right-hand side entries.
 *  \param[in,out] x Right-hand side on entry; the 8*M entries starting
 *  at x[d0] are overwritten with the result.
 */
void
MBCSR_MatTrisolve_Lower_v1_aX_xs1(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x )
{
	oski_value_t* rhs = x + d0;      /* 8 entries of the current block row */
	const oski_value_t* tri = diag;  /* 8x8 diagonal block of the current row */
	oski_index_t brow;

	for( brow = 0; brow != M; brow++ )
	{
		const oski_index_t k_begin = ptr[brow];
		const oski_index_t k_end = ptr[brow+1];
		const oski_value_t* blk = val + k_begin*8*7;
		oski_value_t acc[8];
		oski_index_t k, r, c;

		/* acc <- alpha * (right-hand side of this block row) */
		for( r = 0; r < 8; r++ )
		{
			VAL_MUL( acc[r], alpha, rhs[r] );
		}

		/* acc <- acc - (off-diagonal block) * (referenced entries of x),
		 * one block column at a time. */
		for( k = k_begin; k < k_end; k++ )
		{
			const oski_value_t* xin = x + ind[k];

			for( c = 0; c < 7; c++ )
			{
				for( r = 0; r < 8; r++ )
				{
					VAL_MSUB( acc[r], blk[r*7+c], xin[c] );
				}
			}
			blk += 8*7;
		}

		/* Forward substitution with the lower triangle of the
		 * diagonal block. */
		for( c = 0; c < 8; c++ )
		{
			VAL_DIVEQ( acc[c], tri[c*8+c] );
			for( r = c+1; r < 8; r++ )
			{
				VAL_MSUB( acc[r], tri[r*8+c], acc[c] );
			}
		}

		/* Store the solved entries back over the right-hand side. */
		for( r = 0; r < 8; r++ )
		{
			VAL_ASSIGN( rhs[r], acc[r] );
		}

		rhs += 8;
		tri += 8*8;
	}
} /* MBCSR_MatTrisolve_Lower_v1_aX_xs1 */

/*
 * When DO_NAME_MANGLING is defined, every later use of the generic
 * general-stride kernel name in this file (including the function
 * definition that follows) is replaced by MANGLE_MOD_() applied to the
 * name with an "_8x7" suffix. MANGLE_MOD_ is defined elsewhere.
 */
#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatTrisolve_Lower_v1_aX_xsX. */
	#define MBCSR_MatTrisolve_Lower_v1_aX_xsX \
		MANGLE_MOD_(MBCSR_MatTrisolve_Lower_v1_aX_xsX_8x7)
#endif

/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$L^{-1}\cdot b\f$, where x has general-stride.
 *
 *  Block rows are processed from first to last. For each block row,
 *  the 8 logical entries of x belonging to it are scaled by alpha,
 *  reduced by the products of the row's off-diagonal \f$8\times 7\f$
 *  blocks with the entries of x those blocks reference, and finally
 *  solved against the row's \f$8\times 8\f$ diagonal block by forward
 *  substitution. Logical entry i of x is stored at x[i*incx].
 *
 *  \param[in] M Number of block rows.
 *  \param[in] d0 Logical offset into x of the first entry to solve for.
 *  \param[in] ptr Block-row pointers: the off-diagonal blocks of
 *  (0-based) block row I occupy positions ptr[I] .. ptr[I+1]-1.
 *  \param[in] ind For each off-diagonal block, the logical index into
 *  x of the first of the 7 entries that the block multiplies.
 *  \param[in] val Off-diagonal blocks, 8*7 values each; entry (r, c)
 *  of a block is at offset r*7 + c.
 *  \param[in] diag Diagonal blocks, 8*8 values per block row; entry
 *  (r, c) is at offset r*8 + c and only entries with c <= r are read.
 *  \param[in] alpha Scalar applied to the right-hand side entries.
 *  \param[in,out] x Right-hand side on entry; the 8*M logical entries
 *  starting at logical index d0 are overwritten with the result.
 *  \param[in] incx Distance, in elements, between consecutive logical
 *  entries of x.
 */
void
MBCSR_MatTrisolve_Lower_v1_aX_xsX(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x, oski_index_t incx )
{
	oski_value_t* rhs = x + d0*incx; /* 8 entries of the current block row */
	const oski_value_t* tri = diag;  /* 8x8 diagonal block of the current row */
	oski_index_t brow;

	for( brow = 0; brow != M; brow++ )
	{
		const oski_index_t k_begin = ptr[brow];
		const oski_index_t k_end = ptr[brow+1];
		const oski_value_t* blk = val + k_begin*8*7;
		oski_value_t acc[8];
		oski_index_t k, r, c;

		/* acc <- alpha * (right-hand side of this block row) */
		for( r = 0; r < 8; r++ )
		{
			VAL_MUL( acc[r], alpha, rhs[r*incx] );
		}

		/* acc <- acc - (off-diagonal block) * (referenced entries of x),
		 * one block column at a time. */
		for( k = k_begin; k < k_end; k++ )
		{
			const oski_value_t* xin = x + ind[k]*incx;

			for( c = 0; c < 7; c++ )
			{
				for( r = 0; r < 8; r++ )
				{
					VAL_MSUB( acc[r], blk[r*7+c], xin[c*incx] );
				}
			}
			blk += 8*7;
		}

		/* Forward substitution with the lower triangle of the
		 * diagonal block. */
		for( c = 0; c < 8; c++ )
		{
			VAL_DIVEQ( acc[c], tri[c*8+c] );
			for( r = c+1; r < 8; r++ )
			{
				VAL_MSUB( acc[r], tri[r*8+c], acc[c] );
			}
		}

		/* Store the solved entries back over the right-hand side. */
		for( r = 0; r < 8; r++ )
		{
			VAL_ASSIGN( rhs[r*incx], acc[r] );
		}

		rhs += 8*incx;
		tri += 8*8;
	}
} /* MBCSR_MatTrisolve_Lower_v1_aX_xsX */

/**
 *  \brief Module wrapper for the \f$8\times 7\f$ implementation
 *  of the sparse triangular solve operation, where the matrix
 *  is lower triangular.
 *
 *  Runs the kernel once per column of x (x->num_cols columns, with
 *  consecutive columns starting x->colinc elements apart). The
 *  unit-stride kernel is used when x->rowinc == 1; otherwise the
 *  general-stride kernel is called with stride x->rowinc.
 *
 *  \param[in] T Submatrix to solve with; asserted to be non-NULL and
 *  to have T->r == 8 and T->c == 7.
 *  \param[in] alpha Scalar forwarded unchanged to the kernel.
 *  \param[in,out] x Vector view handed column by column to the kernel,
 *  which overwrites it in place; asserted not to be INVALID_VEC.
 */
static void
MatTrisolve_Lower( const oski_submatMBCSR_t* T,
	oski_value_t alpha, oski_vecview_t x )
{
	oski_index_t col;
	oski_value_t* xcol;

	assert( T != NULL );
	assert( T->r == 8 );
	assert( T->c == 7 );
	assert( x != INVALID_VEC );

	xcol = x->val;
	for( col = 0; col < x->num_cols; col++ )
	{
		if( x->rowinc == 1 )
		{
			MBCSR_MatTrisolve_Lower_v1_aX_xs1( T->num_block_rows,
				T->offset, T->bptr, T->bind, T->bval, T->bdiag,
				alpha, xcol );
		}
		else
		{
			/* x has general (non-unit) stride */
			MBCSR_MatTrisolve_Lower_v1_aX_xsX( T->num_block_rows,
				T->offset, T->bptr, T->bind, T->bval, T->bdiag,
				alpha, xcol, x->rowinc );
		}
		xcol += x->colinc;
	}
}

/*
 * Name of the unit-stride conjugate kernel. For real values the name
 * is an alias for the non-conjugate kernel; for complex values it is
 * (optionally) mangled, and the kernel itself is defined in the
 * IS_VAL_COMPLEX section that follows.
 */
#if !IS_VAL_COMPLEX
	/** Synonym for pure real version */
	#define MBCSR_MatConjTrisolve_Lower_v1_aX_xs1 MBCSR_MatTrisolve_Lower_v1_aX_xs1
#else /* IS_VAL_COMPLEX */

#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatConjTrisolve_Lower_v1_aX_xs1. */
	#define MBCSR_MatConjTrisolve_Lower_v1_aX_xs1 \
		MANGLE_MOD_(MBCSR_MatConjTrisolve_Lower_v1_aX_xs1_8x7)
#endif

#endif
#if IS_VAL_COMPLEX
/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$\bar{L}^{-1}\cdot b\f$, where x has unit-stride.
 *
 *  Same traversal as the non-conjugate unit-stride kernel, except that
 *  every operation involving a matrix entry goes through the
 *  VAL_MSUB_CONJ / VAL_DIVEQ_CONJ macros (defined elsewhere; presumably
 *  they use the complex conjugate of the matrix entry). The initial
 *  scaling by alpha uses plain VAL_MUL.
 *
 *  \param[in] M Number of block rows.
 *  \param[in] d0 Offset into x of the first entry to solve for.
 *  \param[in] ptr Block-row pointers: the off-diagonal blocks of
 *  (0-based) block row I occupy positions ptr[I] .. ptr[I+1]-1.
 *  \param[in] ind For each off-diagonal block, the index into x of
 *  the first of the 7 entries that the block multiplies.
 *  \param[in] val Off-diagonal blocks, 8*7 values each; entry (r, c)
 *  of a block is at offset r*7 + c.
 *  \param[in] diag Diagonal blocks, 8*8 values per block row; entry
 *  (r, c) is at offset r*8 + c and only entries with c <= r are read.
 *  \param[in] alpha Scalar applied to the right-hand side entries.
 *  \param[in,out] x Right-hand side on entry; the 8*M entries starting
 *  at x[d0] are overwritten with the result.
 */
void
MBCSR_MatConjTrisolve_Lower_v1_aX_xs1(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x )
{
	oski_value_t* rhs = x + d0;      /* 8 entries of the current block row */
	const oski_value_t* tri = diag;  /* 8x8 diagonal block of the current row */
	oski_index_t brow;

	for( brow = 0; brow != M; brow++ )
	{
		const oski_index_t k_begin = ptr[brow];
		const oski_index_t k_end = ptr[brow+1];
		const oski_value_t* blk = val + k_begin*8*7;
		oski_value_t acc[8];
		oski_index_t k, r, c;

		/* acc <- alpha * (right-hand side of this block row) */
		for( r = 0; r < 8; r++ )
		{
			VAL_MUL( acc[r], alpha, rhs[r] );
		}

		/* Subtract the off-diagonal contributions, one block column
		 * at a time. */
		for( k = k_begin; k < k_end; k++ )
		{
			const oski_value_t* xin = x + ind[k];

			for( c = 0; c < 7; c++ )
			{
				for( r = 0; r < 8; r++ )
				{
					VAL_MSUB_CONJ( acc[r], blk[r*7+c], xin[c] );
				}
			}
			blk += 8*7;
		}

		/* Forward substitution with the lower triangle of the
		 * diagonal block. */
		for( c = 0; c < 8; c++ )
		{
			VAL_DIVEQ_CONJ( acc[c], tri[c*8+c] );
			for( r = c+1; r < 8; r++ )
			{
				VAL_MSUB_CONJ( acc[r], tri[r*8+c], acc[c] );
			}
		}

		/* Store the solved entries back over the right-hand side. */
		for( r = 0; r < 8; r++ )
		{
			VAL_ASSIGN( rhs[r], acc[r] );
		}

		rhs += 8;
		tri += 8*8;
	}
} /* MBCSR_MatConjTrisolve_Lower_v1_aX_xs1 */

#endif /* IS_VAL_COMPLEX */
/*
 * Name of the general-stride conjugate kernel. For real values the
 * name is an alias for the non-conjugate kernel; for complex values it
 * is (optionally) mangled, and the kernel itself is defined in the
 * IS_VAL_COMPLEX section that follows.
 */
#if !IS_VAL_COMPLEX
	/** Synonym for pure real version */
	#define MBCSR_MatConjTrisolve_Lower_v1_aX_xsX MBCSR_MatTrisolve_Lower_v1_aX_xsX
#else /* IS_VAL_COMPLEX */

#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatConjTrisolve_Lower_v1_aX_xsX. */
	#define MBCSR_MatConjTrisolve_Lower_v1_aX_xsX \
		MANGLE_MOD_(MBCSR_MatConjTrisolve_Lower_v1_aX_xsX_8x7)
#endif

#endif
#if IS_VAL_COMPLEX
/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$\bar{L}^{-1}\cdot b\f$, where x has general-stride.
 *
 *  Same traversal as the non-conjugate general-stride kernel, except
 *  that every operation involving a matrix entry goes through the
 *  VAL_MSUB_CONJ / VAL_DIVEQ_CONJ macros (defined elsewhere; presumably
 *  they use the complex conjugate of the matrix entry). The initial
 *  scaling by alpha uses plain VAL_MUL. Logical entry i of x is stored
 *  at x[i*incx].
 *
 *  \param[in] M Number of block rows.
 *  \param[in] d0 Logical offset into x of the first entry to solve for.
 *  \param[in] ptr Block-row pointers: the off-diagonal blocks of
 *  (0-based) block row I occupy positions ptr[I] .. ptr[I+1]-1.
 *  \param[in] ind For each off-diagonal block, the logical index into
 *  x of the first of the 7 entries that the block multiplies.
 *  \param[in] val Off-diagonal blocks, 8*7 values each; entry (r, c)
 *  of a block is at offset r*7 + c.
 *  \param[in] diag Diagonal blocks, 8*8 values per block row; entry
 *  (r, c) is at offset r*8 + c and only entries with c <= r are read.
 *  \param[in] alpha Scalar applied to the right-hand side entries.
 *  \param[in,out] x Right-hand side on entry; the 8*M logical entries
 *  starting at logical index d0 are overwritten with the result.
 *  \param[in] incx Distance, in elements, between consecutive logical
 *  entries of x.
 */
void
MBCSR_MatConjTrisolve_Lower_v1_aX_xsX(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x, oski_index_t incx )
{
	oski_value_t* rhs = x + d0*incx; /* 8 entries of the current block row */
	const oski_value_t* tri = diag;  /* 8x8 diagonal block of the current row */
	oski_index_t brow;

	for( brow = 0; brow != M; brow++ )
	{
		const oski_index_t k_begin = ptr[brow];
		const oski_index_t k_end = ptr[brow+1];
		const oski_value_t* blk = val + k_begin*8*7;
		oski_value_t acc[8];
		oski_index_t k, r, c;

		/* acc <- alpha * (right-hand side of this block row) */
		for( r = 0; r < 8; r++ )
		{
			VAL_MUL( acc[r], alpha, rhs[r*incx] );
		}

		/* Subtract the off-diagonal contributions, one block column
		 * at a time. */
		for( k = k_begin; k < k_end; k++ )
		{
			const oski_value_t* xin = x + ind[k]*incx;

			for( c = 0; c < 7; c++ )
			{
				for( r = 0; r < 8; r++ )
				{
					VAL_MSUB_CONJ( acc[r], blk[r*7+c], xin[c*incx] );
				}
			}
			blk += 8*7;
		}

		/* Forward substitution with the lower triangle of the
		 * diagonal block. */
		for( c = 0; c < 8; c++ )
		{
			VAL_DIVEQ_CONJ( acc[c], tri[c*8+c] );
			for( r = c+1; r < 8; r++ )
			{
				VAL_MSUB_CONJ( acc[r], tri[r*8+c], acc[c] );
			}
		}

		/* Store the solved entries back over the right-hand side. */
		for( r = 0; r < 8; r++ )
		{
			VAL_ASSIGN( rhs[r*incx], acc[r] );
		}

		rhs += 8*incx;
		tri += 8*8;
	}
} /* MBCSR_MatConjTrisolve_Lower_v1_aX_xsX */

#endif /* IS_VAL_COMPLEX */
/**
 *  \brief Module wrapper for the \f$8\times 7\f$ implementation
 *  of the conjugated sparse triangular solve operation, where
 *  the matrix is lower triangular.
 *
 *  Runs the conjugate kernel once per column of x (x->num_cols
 *  columns, with consecutive columns starting x->colinc elements
 *  apart). The unit-stride kernel is used when x->rowinc == 1;
 *  otherwise the general-stride kernel is called with stride
 *  x->rowinc. When IS_VAL_COMPLEX is zero the conjugate kernel names
 *  are macros for the non-conjugate kernels.
 *
 *  \param[in] T Submatrix to solve with; asserted to be non-NULL and
 *  to have T->r == 8 and T->c == 7.
 *  \param[in] alpha Scalar forwarded unchanged to the kernel.
 *  \param[in,out] x Vector view handed column by column to the kernel,
 *  which overwrites it in place; asserted not to be INVALID_VEC.
 */
static void
MatConjTrisolve_Lower( const oski_submatMBCSR_t* T,
	oski_value_t alpha, oski_vecview_t x )
{
	oski_index_t col;
	oski_value_t* xcol;

	assert( T != NULL );
	assert( T->r == 8 );
	assert( T->c == 7 );
	assert( x != INVALID_VEC );

	xcol = x->val;
	for( col = 0; col < x->num_cols; col++ )
	{
		if( x->rowinc == 1 )
		{
			MBCSR_MatConjTrisolve_Lower_v1_aX_xs1( T->num_block_rows,
				T->offset, T->bptr, T->bind, T->bval, T->bdiag,
				alpha, xcol );
		}
		else
		{
			/* x has general (non-unit) stride */
			MBCSR_MatConjTrisolve_Lower_v1_aX_xsX( T->num_block_rows,
				T->offset, T->bptr, T->bind, T->bval, T->bdiag,
				alpha, xcol, x->rowinc );
		}
		xcol += x->colinc;
	}
}

/*
 * When DO_NAME_MANGLING is defined, every later use of the generic
 * unit-stride transpose kernel name in this file (including the
 * function definition that follows) is replaced by MANGLE_MOD_()
 * applied to the name with an "_8x7" suffix. MANGLE_MOD_ is defined
 * elsewhere.
 */
#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatTransTrisolve_Lower_v1_aX_xs1. */
	#define MBCSR_MatTransTrisolve_Lower_v1_aX_xs1 \
		MANGLE_MOD_(MBCSR_MatTransTrisolve_Lower_v1_aX_xs1_8x7)
#endif

/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$L^{-T}\cdot b\f$, where x has unit-stride.
 *
 *  After scaling x by alpha, block rows are processed from last to
 *  first. For each block row, its 8 entries of x are solved against
 *  the transpose of the row's \f$8\times 8\f$ diagonal block by back
 *  substitution; the solved values are then multiplied by the
 *  transpose of each off-diagonal \f$8\times 7\f$ block of the row and
 *  subtracted from the 7 entries of x that the block references.
 *
 *  The per-row pointers into x and diag are derived from the block-row
 *  index inside the loop, so no pointer in front of either array is
 *  ever formed (not even when M == 0), and the loop does not run for
 *  M <= 0.
 *
 *  \param[in] M Number of block rows.
 *  \param[in] d0 Offset into x of the first entry to solve for.
 *  \param[in] ptr Block-row pointers: the off-diagonal blocks of
 *  (0-based) block row I occupy positions ptr[I] .. ptr[I+1]-1.
 *  \param[in] ind For each off-diagonal block, the index into x of
 *  the first of the 7 entries that the block updates.
 *  \param[in] val Off-diagonal blocks, 8*7 values each; entry (r, c)
 *  of a block is at offset r*7 + c.
 *  \param[in] diag Diagonal blocks, 8*8 values per block row; entry
 *  (r, c) is at offset r*8 + c and only entries with c <= r are read.
 *  \param[in] alpha Scalar applied to x before solving.
 *  \param[in,out] x Right-hand side on entry; overwritten in place.
 */
void
MBCSR_MatTransTrisolve_Lower_v1_aX_xs1(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x )
{
	oski_index_t brow;

	/*
	 * Scale by alpha up front. The scaling covers the 8*M entries
	 * beginning at x[0], not at x[d0].
	 * NOTE(review): confirm this is intended when d0 != 0.
	 */
	{
		oski_index_t m = M * 8;
		oski_index_t incx = 1;
		BLAS_xSCAL( &m, &alpha, x, &incx );
	}

	/* brow is the 1-based block row, walked from M down to 1. */
	for( brow = M; brow > 0; brow-- )
	{
		oski_value_t* const rhs = x + (d0 + (brow-1)*8);
		const oski_value_t* const tri = diag + (brow-1)*8*8;
		const oski_index_t k_begin = ptr[brow-1];
		const oski_index_t k_end = ptr[brow];
		const oski_value_t* blk = val + k_begin*8*7;
		oski_value_t acc[8];
		oski_index_t k, r, c;

		for( r = 0; r < 8; r++ )
		{
			VAL_ASSIGN( acc[r], rhs[r] );
		}

		/* Back substitution with the transpose of the diagonal block:
		 * pivot rows 7 down to 0, each eliminating from the entries
		 * above it using row 'piv' of the stored lower triangle. */
		for( c = 8; c > 0; c-- )
		{
			const oski_index_t piv = c - 1;

			VAL_DIVEQ( acc[piv], tri[piv*8+piv] );
			for( r = 0; r < piv; r++ )
			{
				VAL_MSUB( acc[r], tri[piv*8+r], acc[piv] );
			}
		}

		/* Push the solved values into the entries of x referenced by
		 * this row's off-diagonal blocks: x <- x - blk^T * acc. */
		for( k = k_begin; k < k_end; k++ )
		{
			oski_value_t* xout = x + ind[k];
			oski_value_t upd[7];

			for( c = 0; c < 7; c++ )
			{
				VAL_ASSIGN( upd[c], xout[c] );
			}
			for( r = 0; r < 8; r++ )
			{
				for( c = 0; c < 7; c++ )
				{
					VAL_MSUB( upd[c], blk[r*7+c], acc[r] );
				}
			}
			for( c = 0; c < 7; c++ )
			{
				VAL_ASSIGN( xout[c], upd[c] );
			}
			blk += 8*7;
		}

		/* Store the solved entries of this block row. */
		for( r = 0; r < 8; r++ )
		{
			VAL_ASSIGN( rhs[r], acc[r] );
		}
	}
} /* MBCSR_MatTransTrisolve_Lower_v1_aX_xs1 */

/*
 * When DO_NAME_MANGLING is defined, every later use of the generic
 * general-stride transpose kernel name in this file (including the
 * function definition that follows) is replaced by MANGLE_MOD_()
 * applied to the name with an "_8x7" suffix. MANGLE_MOD_ is defined
 * elsewhere.
 */
#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatTransTrisolve_Lower_v1_aX_xsX. */
	#define MBCSR_MatTransTrisolve_Lower_v1_aX_xsX \
		MANGLE_MOD_(MBCSR_MatTransTrisolve_Lower_v1_aX_xsX_8x7)
#endif

/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$L^{-T}\cdot b\f$, where x has general-stride.
 */
void
MBCSR_MatTransTrisolve_Lower_v1_aX_xsX(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x, oski_index_t incx )
{
	oski_index_t I;
	oski_value_t* bp = x + (d0 + (M-1)*8)*incx;
	const oski_value_t* dp = diag + (M-1)*8*8;

	{
		oski_index_t m = M * 8;
		BLAS_xSCAL( &m, &alpha, x, &incx );
	}

	for( I = M; I != 0; I--, bp -= 8*incx, dp -= 8*8 )
	{
		oski_index_t K;
		REGISTER oski_value_t _b0;
		REGISTER oski_value_t _b1;
		REGISTER oski_value_t _b2;
		REGISTER oski_value_t _b3;
		REGISTER oski_value_t _b4;
		REGISTER oski_value_t _b5;
		REGISTER oski_value_t _b6;
		REGISTER oski_value_t _b7;
		const oski_value_t* vp = val + ptr[I-1]*8*7;

		VAL_ASSIGN( _b0, bp[0] );
		VAL_ASSIGN( _b1, bp[1*incx] );
		VAL_ASSIGN( _b2, bp[2*incx] );
		VAL_ASSIGN( _b3, bp[3*incx] );
		VAL_ASSIGN( _b4, bp[4*incx] );
		VAL_ASSIGN( _b5, bp[5*incx] );
		VAL_ASSIGN( _b6, bp[6*incx] );
		VAL_ASSIGN( _b7, bp[7*incx] );

		VAL_DIVEQ( _b7, dp[7*8+7] );
		VAL_MSUB( _b0, dp[7*8+0], _b7 );
		VAL_MSUB( _b1, dp[7*8+1], _b7 );
		VAL_MSUB( _b2, dp[7*8+2], _b7 );
		VAL_MSUB( _b3, dp[7*8+3], _b7 );
		VAL_MSUB( _b4, dp[7*8+4], _b7 );
		VAL_MSUB( _b5, dp[7*8+5], _b7 );
		VAL_MSUB( _b6, dp[7*8+6], _b7 );
		VAL_DIVEQ( _b6, dp[6*8+6] );
		VAL_MSUB( _b0, dp[6*8+0], _b6 );
		VAL_MSUB( _b1, dp[6*8+1], _b6 );
		VAL_MSUB( _b2, dp[6*8+2], _b6 );
		VAL_MSUB( _b3, dp[6*8+3], _b6 );
		VAL_MSUB( _b4, dp[6*8+4], _b6 );
		VAL_MSUB( _b5, dp[6*8+5], _b6 );
		VAL_DIVEQ( _b5, dp[5*8+5] );
		VAL_MSUB( _b0, dp[5*8+0], _b5 );
		VAL_MSUB( _b1, dp[5*8+1], _b5 );
		VAL_MSUB( _b2, dp[5*8+2], _b5 );
		VAL_MSUB( _b3, dp[5*8+3], _b5 );
		VAL_MSUB( _b4, dp[5*8+4], _b5 );
		VAL_DIVEQ( _b4, dp[4*8+4] );
		VAL_MSUB( _b0, dp[4*8+0], _b4 );
		VAL_MSUB( _b1, dp[4*8+1], _b4 );
		VAL_MSUB( _b2, dp[4*8+2], _b4 );
		VAL_MSUB( _b3, dp[4*8+3], _b4 );
		VAL_DIVEQ( _b3, dp[3*8+3] );
		VAL_MSUB( _b0, dp[3*8+0], _b3 );
		VAL_MSUB( _b1, dp[3*8+1], _b3 );
		VAL_MSUB( _b2, dp[3*8+2], _b3 );
		VAL_DIVEQ( _b2, dp[2*8+2] );
		VAL_MSUB( _b0, dp[2*8+0], _b2 );
		VAL_MSUB( _b1, dp[2*8+1], _b2 );
		VAL_DIVEQ( _b1, dp[1*8+1] );
		VAL_MSUB( _b0, dp[1*8+0], _b1 );
		VAL_DIVEQ( _b0, dp[0*8+0] );

		for( K = ptr[I-1]; K < ptr[I]; K++, vp += 8*7 )
		{
			oski_index_t j0 = ind[K];
			oski_value_t* xp = x + j0*incx;
			REGISTER oski_value_t _x0;
			REGISTER oski_value_t _x1;
			REGISTER oski_value_t _x2;
			REGISTER oski_value_t _x3;
			REGISTER oski_value_t _x4;
			REGISTER oski_value_t _x5;
			REGISTER oski_value_t _x6;

			VAL_ASSIGN( _x0, xp[0] );
			VAL_ASSIGN( _x1, xp[1*incx] );
			VAL_ASSIGN( _x2, xp[2*incx] );
			VAL_ASSIGN( _x3, xp[3*incx] );
			VAL_ASSIGN( _x4, xp[4*incx] );
			VAL_ASSIGN( _x5, xp[5*incx] );
			VAL_ASSIGN( _x6, xp[6*incx] );

			VAL_MSUB( _x0, vp[0], _b0 );
			VAL_MSUB( _x1, vp[1], _b0 );
			VAL_MSUB( _x2, vp[2], _b0 );
			VAL_MSUB( _x3, vp[3], _b0 );
			VAL_MSUB( _x4, vp[4], _b0 );
			VAL_MSUB( _x5, vp[5], _b0 );
			VAL_MSUB( _x6, vp[6], _b0 );
			VAL_MSUB( _x0, vp[7], _b1 );
			VAL_MSUB( _x1, vp[8], _b1 );
			VAL_MSUB( _x2, vp[9], _b1 );
			VAL_MSUB( _x3, vp[10], _b1 );
			VAL_MSUB( _x4, vp[11], _b1 );
			VAL_MSUB( _x5, vp[12], _b1 );
			VAL_MSUB( _x6, vp[13], _b1 );
			VAL_MSUB( _x0, vp[14], _b2 );
			VAL_MSUB( _x1, vp[15], _b2 );
			VAL_MSUB( _x2, vp[16], _b2 );
			VAL_MSUB( _x3, vp[17], _b2 );
			VAL_MSUB( _x4, vp[18], _b2 );
			VAL_MSUB( _x5, vp[19], _b2 );
			VAL_MSUB( _x6, vp[20], _b2 );
			VAL_MSUB( _x0, vp[21], _b3 );
			VAL_MSUB( _x1, vp[22], _b3 );
			VAL_MSUB( _x2, vp[23], _b3 );
			VAL_MSUB( _x3, vp[24], _b3 );
			VAL_MSUB( _x4, vp[25], _b3 );
			VAL_MSUB( _x5, vp[26], _b3 );
			VAL_MSUB( _x6, vp[27], _b3 );
			VAL_MSUB( _x0, vp[28], _b4 );
			VAL_MSUB( _x1, vp[29], _b4 );
			VAL_MSUB( _x2, vp[30], _b4 );
			VAL_MSUB( _x3, vp[31], _b4 );
			VAL_MSUB( _x4, vp[32], _b4 );
			VAL_MSUB( _x5, vp[33], _b4 );
			VAL_MSUB( _x6, vp[34], _b4 );
			VAL_MSUB( _x0, vp[35], _b5 );
			VAL_MSUB( _x1, vp[36], _b5 );
			VAL_MSUB( _x2, vp[37], _b5 );
			VAL_MSUB( _x3, vp[38], _b5 );
			VAL_MSUB( _x4, vp[39], _b5 );
			VAL_MSUB( _x5, vp[40], _b5 );
			VAL_MSUB( _x6, vp[41], _b5 );
			VAL_MSUB( _x0, vp[42], _b6 );
			VAL_MSUB( _x1, vp[43], _b6 );
			VAL_MSUB( _x2, vp[44], _b6 );
			VAL_MSUB( _x3, vp[45], _b6 );
			VAL_MSUB( _x4, vp[46], _b6 );
			VAL_MSUB( _x5, vp[47], _b6 );
			VAL_MSUB( _x6, vp[48], _b6 );
			VAL_MSUB( _x0, vp[49], _b7 );
			VAL_MSUB( _x1, vp[50], _b7 );
			VAL_MSUB( _x2, vp[51], _b7 );
			VAL_MSUB( _x3, vp[52], _b7 );
			VAL_MSUB( _x4, vp[53], _b7 );
			VAL_MSUB( _x5, vp[54], _b7 );
			VAL_MSUB( _x6, vp[55], _b7 );
			VAL_ASSIGN( xp[0], _x0 );
			VAL_ASSIGN( xp[1*incx], _x1 );
			VAL_ASSIGN( xp[2*incx], _x2 );
			VAL_ASSIGN( xp[3*incx], _x3 );
			VAL_ASSIGN( xp[4*incx], _x4 );
			VAL_ASSIGN( xp[5*incx], _x5 );
			VAL_ASSIGN( xp[6*incx], _x6 );
		} /* K */

		VAL_ASSIGN( bp[0], _b0 );
		VAL_ASSIGN( bp[1*incx], _b1 );
		VAL_ASSIGN( bp[2*incx], _b2 );
		VAL_ASSIGN( bp[3*incx], _b3 );
		VAL_ASSIGN( bp[4*incx], _b4 );
		VAL_ASSIGN( bp[5*incx], _b5 );
		VAL_ASSIGN( bp[6*incx], _b6 );
		VAL_ASSIGN( bp[7*incx], _b7 );
	} /* I */
} /* MBCSR_MatTransTrisolve_Lower_v1_aX_xsX */

/**
 *  \brief Module wrapper for the \f$8\times 7\f$ implementation
 *  of the transposed sparse triangular solve, where the stored
 *  matrix is lower triangular.
 *
 *  Every column of the vector view x is solved in place, one
 *  column at a time.  Columns whose row stride is 1 go to the
 *  unit-stride kernel; any other row stride goes to the
 *  general-stride kernel.
 *
 *  \param[in] T 8x7 MBCSR submatrix (checked by assertion).
 *  \param[in] alpha Scale factor forwarded to the kernel.
 *  \param[in,out] x Vector view overwritten with the result.
 */
static void
MatTransTrisolve_Lower( const oski_submatMBCSR_t* T,
	oski_value_t alpha, oski_vecview_t x )
{
	oski_index_t cols_left;
	oski_value_t* col;

	assert( T != NULL );
	assert( T->r == 8 );
	assert( T->c == 7 );
	assert( x != INVALID_VEC );

	col = x->val;
	for( cols_left = x->num_cols; cols_left > 0; cols_left-- )
	{
		if( x->rowinc == 1 )
		{
			MBCSR_MatTransTrisolve_Lower_v1_aX_xs1(
				T->num_block_rows, T->offset,
				T->bptr, T->bind, T->bval, T->bdiag,
				alpha, col );
		}
		else
		{
			/* General (non-unit) row stride. */
			MBCSR_MatTransTrisolve_Lower_v1_aX_xsX(
				T->num_block_rows, T->offset,
				T->bptr, T->bind, T->bval, T->bdiag,
				alpha, col, x->rowinc );
		}
		col += x->colinc;
	}
}

#if !IS_VAL_COMPLEX
	/**
	 *  Real-valued build: no separate conjugate-transpose kernel is
	 *  compiled; the name is a synonym for the transpose kernel.
	 */
	#define MBCSR_MatHermTrisolve_Lower_v1_aX_xs1 MBCSR_MatTransTrisolve_Lower_v1_aX_xs1
#else /* IS_VAL_COMPLEX */

#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatHermTrisolve_Lower_v1_aX_xs1. */
	#define MBCSR_MatHermTrisolve_Lower_v1_aX_xs1 \
		MANGLE_MOD_(MBCSR_MatHermTrisolve_Lower_v1_aX_xs1_8x7)
#endif

#endif
#if IS_VAL_COMPLEX
/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$\bar{L}^{-T}\cdot b\f$, where x has unit-stride.
 *
 *  Only compiled for complex value types.  The statement sequence
 *  matches the transpose kernel, with VAL_DIVEQ_CONJ and
 *  VAL_MSUB_CONJ in place of VAL_DIVEQ and VAL_MSUB.  Those macros
 *  are defined outside this file; presumably they use the conjugate
 *  of the matrix entry (second argument) -- confirm against their
 *  definitions.
 *
 *  The solve is done in place on x, from the last block row
 *  (I = M) to the first (I = 1).
 *
 *  \param[in] M Number of block rows processed.
 *  \param[in] d0 Element offset of the first block row within x.
 *  \param[in] ptr Block-row pointers: block row I-1 owns the
 *  off-diagonal blocks ptr[I-1] <= K < ptr[I].
 *  \param[in] ind For block K, ind[K] is the index of the first of
 *  the 7 entries of x that block touches.
 *  \param[in] val Off-diagonal block values, 8*7 values per block.
 *  \param[in] diag Diagonal block values, 8*8 values per block row.
 *  \param[in] alpha Scale factor applied to x through BLAS_xSCAL
 *  before the solve.
 *  \param[in,out] x Unit-stride vector overwritten with the result.
 *
 *  NOTE(review): the BLAS_xSCAL call scales M*8 elements starting
 *  at x, whereas the block rows being solved start at x + d0.
 *  The two ranges coincide only when d0 == 0.  Confirm that callers
 *  either pass d0 == 0 or alpha == 1.
 */
void
MBCSR_MatHermTrisolve_Lower_v1_aX_xs1(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x )
{
	oski_index_t I;
	/* Start at the last block row and its diagonal block. */
	oski_value_t* bp = x + (d0 + (M-1)*8);
	const oski_value_t* dp = diag + (M-1)*8*8;

	/* Apply alpha once, up front, to M*8 contiguous elements of x. */
	{
		oski_index_t m = M * 8;
		oski_index_t incx = 1;
		BLAS_xSCAL( &m, &alpha, x, &incx );
	}

	/* Block rows are visited in reverse: I = M, M-1, ..., 1. */
	for( I = M; I != 0; I--, bp -= 8, dp -= 8*8 )
	{
		oski_index_t K;
		REGISTER oski_value_t _b0;
		REGISTER oski_value_t _b1;
		REGISTER oski_value_t _b2;
		REGISTER oski_value_t _b3;
		REGISTER oski_value_t _b4;
		REGISTER oski_value_t _b5;
		REGISTER oski_value_t _b6;
		REGISTER oski_value_t _b7;
		/* First off-diagonal block of this block row (56 values each). */
		const oski_value_t* vp = val + ptr[I-1]*8*7;

		/* Load the 8 entries of this block row into locals. */
		VAL_ASSIGN( _b0, bp[0] );
		VAL_ASSIGN( _b1, bp[1] );
		VAL_ASSIGN( _b2, bp[2] );
		VAL_ASSIGN( _b3, bp[3] );
		VAL_ASSIGN( _b4, bp[4] );
		VAL_ASSIGN( _b5, bp[5] );
		VAL_ASSIGN( _b6, bp[6] );
		VAL_ASSIGN( _b7, bp[7] );

		/*
		 *  Diagonal-block solve, from entry 7 down to entry 0: entry i
		 *  is processed with dp[i*8+i], then each entry j < i is
		 *  updated with dp[i*8+j] and the new entry i.
		 */
		VAL_DIVEQ_CONJ( _b7, dp[7*8+7] );
		VAL_MSUB_CONJ( _b0, dp[7*8+0], _b7 );
		VAL_MSUB_CONJ( _b1, dp[7*8+1], _b7 );
		VAL_MSUB_CONJ( _b2, dp[7*8+2], _b7 );
		VAL_MSUB_CONJ( _b3, dp[7*8+3], _b7 );
		VAL_MSUB_CONJ( _b4, dp[7*8+4], _b7 );
		VAL_MSUB_CONJ( _b5, dp[7*8+5], _b7 );
		VAL_MSUB_CONJ( _b6, dp[7*8+6], _b7 );
		VAL_DIVEQ_CONJ( _b6, dp[6*8+6] );
		VAL_MSUB_CONJ( _b0, dp[6*8+0], _b6 );
		VAL_MSUB_CONJ( _b1, dp[6*8+1], _b6 );
		VAL_MSUB_CONJ( _b2, dp[6*8+2], _b6 );
		VAL_MSUB_CONJ( _b3, dp[6*8+3], _b6 );
		VAL_MSUB_CONJ( _b4, dp[6*8+4], _b6 );
		VAL_MSUB_CONJ( _b5, dp[6*8+5], _b6 );
		VAL_DIVEQ_CONJ( _b5, dp[5*8+5] );
		VAL_MSUB_CONJ( _b0, dp[5*8+0], _b5 );
		VAL_MSUB_CONJ( _b1, dp[5*8+1], _b5 );
		VAL_MSUB_CONJ( _b2, dp[5*8+2], _b5 );
		VAL_MSUB_CONJ( _b3, dp[5*8+3], _b5 );
		VAL_MSUB_CONJ( _b4, dp[5*8+4], _b5 );
		VAL_DIVEQ_CONJ( _b4, dp[4*8+4] );
		VAL_MSUB_CONJ( _b0, dp[4*8+0], _b4 );
		VAL_MSUB_CONJ( _b1, dp[4*8+1], _b4 );
		VAL_MSUB_CONJ( _b2, dp[4*8+2], _b4 );
		VAL_MSUB_CONJ( _b3, dp[4*8+3], _b4 );
		VAL_DIVEQ_CONJ( _b3, dp[3*8+3] );
		VAL_MSUB_CONJ( _b0, dp[3*8+0], _b3 );
		VAL_MSUB_CONJ( _b1, dp[3*8+1], _b3 );
		VAL_MSUB_CONJ( _b2, dp[3*8+2], _b3 );
		VAL_DIVEQ_CONJ( _b2, dp[2*8+2] );
		VAL_MSUB_CONJ( _b0, dp[2*8+0], _b2 );
		VAL_MSUB_CONJ( _b1, dp[2*8+1], _b2 );
		VAL_DIVEQ_CONJ( _b1, dp[1*8+1] );
		VAL_MSUB_CONJ( _b0, dp[1*8+0], _b1 );
		VAL_DIVEQ_CONJ( _b0, dp[0*8+0] );

		/*
		 *  Push the solved entries through every off-diagonal block of
		 *  this block row: entry c of the 7 addressed x values is
		 *  updated with vp[r*7+c] and solved entry r, for r = 0..7.
		 */
		for( K = ptr[I-1]; K < ptr[I]; K++, vp += 8*7 )
		{
			oski_index_t j0 = ind[K];
			oski_value_t* xp = x + j0;
			REGISTER oski_value_t _x0;
			REGISTER oski_value_t _x1;
			REGISTER oski_value_t _x2;
			REGISTER oski_value_t _x3;
			REGISTER oski_value_t _x4;
			REGISTER oski_value_t _x5;
			REGISTER oski_value_t _x6;

			VAL_ASSIGN( _x0, xp[0] );
			VAL_ASSIGN( _x1, xp[1] );
			VAL_ASSIGN( _x2, xp[2] );
			VAL_ASSIGN( _x3, xp[3] );
			VAL_ASSIGN( _x4, xp[4] );
			VAL_ASSIGN( _x5, xp[5] );
			VAL_ASSIGN( _x6, xp[6] );

			VAL_MSUB_CONJ( _x0, vp[0], _b0 );
			VAL_MSUB_CONJ( _x1, vp[1], _b0 );
			VAL_MSUB_CONJ( _x2, vp[2], _b0 );
			VAL_MSUB_CONJ( _x3, vp[3], _b0 );
			VAL_MSUB_CONJ( _x4, vp[4], _b0 );
			VAL_MSUB_CONJ( _x5, vp[5], _b0 );
			VAL_MSUB_CONJ( _x6, vp[6], _b0 );
			VAL_MSUB_CONJ( _x0, vp[7], _b1 );
			VAL_MSUB_CONJ( _x1, vp[8], _b1 );
			VAL_MSUB_CONJ( _x2, vp[9], _b1 );
			VAL_MSUB_CONJ( _x3, vp[10], _b1 );
			VAL_MSUB_CONJ( _x4, vp[11], _b1 );
			VAL_MSUB_CONJ( _x5, vp[12], _b1 );
			VAL_MSUB_CONJ( _x6, vp[13], _b1 );
			VAL_MSUB_CONJ( _x0, vp[14], _b2 );
			VAL_MSUB_CONJ( _x1, vp[15], _b2 );
			VAL_MSUB_CONJ( _x2, vp[16], _b2 );
			VAL_MSUB_CONJ( _x3, vp[17], _b2 );
			VAL_MSUB_CONJ( _x4, vp[18], _b2 );
			VAL_MSUB_CONJ( _x5, vp[19], _b2 );
			VAL_MSUB_CONJ( _x6, vp[20], _b2 );
			VAL_MSUB_CONJ( _x0, vp[21], _b3 );
			VAL_MSUB_CONJ( _x1, vp[22], _b3 );
			VAL_MSUB_CONJ( _x2, vp[23], _b3 );
			VAL_MSUB_CONJ( _x3, vp[24], _b3 );
			VAL_MSUB_CONJ( _x4, vp[25], _b3 );
			VAL_MSUB_CONJ( _x5, vp[26], _b3 );
			VAL_MSUB_CONJ( _x6, vp[27], _b3 );
			VAL_MSUB_CONJ( _x0, vp[28], _b4 );
			VAL_MSUB_CONJ( _x1, vp[29], _b4 );
			VAL_MSUB_CONJ( _x2, vp[30], _b4 );
			VAL_MSUB_CONJ( _x3, vp[31], _b4 );
			VAL_MSUB_CONJ( _x4, vp[32], _b4 );
			VAL_MSUB_CONJ( _x5, vp[33], _b4 );
			VAL_MSUB_CONJ( _x6, vp[34], _b4 );
			VAL_MSUB_CONJ( _x0, vp[35], _b5 );
			VAL_MSUB_CONJ( _x1, vp[36], _b5 );
			VAL_MSUB_CONJ( _x2, vp[37], _b5 );
			VAL_MSUB_CONJ( _x3, vp[38], _b5 );
			VAL_MSUB_CONJ( _x4, vp[39], _b5 );
			VAL_MSUB_CONJ( _x5, vp[40], _b5 );
			VAL_MSUB_CONJ( _x6, vp[41], _b5 );
			VAL_MSUB_CONJ( _x0, vp[42], _b6 );
			VAL_MSUB_CONJ( _x1, vp[43], _b6 );
			VAL_MSUB_CONJ( _x2, vp[44], _b6 );
			VAL_MSUB_CONJ( _x3, vp[45], _b6 );
			VAL_MSUB_CONJ( _x4, vp[46], _b6 );
			VAL_MSUB_CONJ( _x5, vp[47], _b6 );
			VAL_MSUB_CONJ( _x6, vp[48], _b6 );
			VAL_MSUB_CONJ( _x0, vp[49], _b7 );
			VAL_MSUB_CONJ( _x1, vp[50], _b7 );
			VAL_MSUB_CONJ( _x2, vp[51], _b7 );
			VAL_MSUB_CONJ( _x3, vp[52], _b7 );
			VAL_MSUB_CONJ( _x4, vp[53], _b7 );
			VAL_MSUB_CONJ( _x5, vp[54], _b7 );
			VAL_MSUB_CONJ( _x6, vp[55], _b7 );
			VAL_ASSIGN( xp[0], _x0 );
			VAL_ASSIGN( xp[1], _x1 );
			VAL_ASSIGN( xp[2], _x2 );
			VAL_ASSIGN( xp[3], _x3 );
			VAL_ASSIGN( xp[4], _x4 );
			VAL_ASSIGN( xp[5], _x5 );
			VAL_ASSIGN( xp[6], _x6 );
		} /* K */

		/* Write the solved block row back into x. */
		VAL_ASSIGN( bp[0], _b0 );
		VAL_ASSIGN( bp[1], _b1 );
		VAL_ASSIGN( bp[2], _b2 );
		VAL_ASSIGN( bp[3], _b3 );
		VAL_ASSIGN( bp[4], _b4 );
		VAL_ASSIGN( bp[5], _b5 );
		VAL_ASSIGN( bp[6], _b6 );
		VAL_ASSIGN( bp[7], _b7 );
	} /* I */
} /* MBCSR_MatHermTrisolve_Lower_v1_aX_xs1 */

#endif /* IS_VAL_COMPLEX */
#if !IS_VAL_COMPLEX
	/**
	 *  Real-valued build: no separate conjugate-transpose kernel is
	 *  compiled; the name is a synonym for the transpose kernel.
	 */
	#define MBCSR_MatHermTrisolve_Lower_v1_aX_xsX MBCSR_MatTransTrisolve_Lower_v1_aX_xsX
#else /* IS_VAL_COMPLEX */

#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatHermTrisolve_Lower_v1_aX_xsX. */
	#define MBCSR_MatHermTrisolve_Lower_v1_aX_xsX \
		MANGLE_MOD_(MBCSR_MatHermTrisolve_Lower_v1_aX_xsX_8x7)
#endif

#endif
#if IS_VAL_COMPLEX
/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$\bar{L}^{-T}\cdot b\f$, where x has general-stride.
 *
 *  Only compiled for complex value types.  The statement sequence
 *  matches the general-stride transpose kernel, with VAL_DIVEQ_CONJ
 *  and VAL_MSUB_CONJ in place of VAL_DIVEQ and VAL_MSUB.  Those
 *  macros are defined outside this file; presumably they use the
 *  conjugate of the matrix entry (second argument) -- confirm
 *  against their definitions.
 *
 *  The solve is done in place on x, from the last block row
 *  (I = M) to the first (I = 1).
 *
 *  \param[in] M Number of block rows processed.
 *  \param[in] d0 Element offset (in units of incx) of the first
 *  block row within x.
 *  \param[in] ptr Block-row pointers: block row I-1 owns the
 *  off-diagonal blocks ptr[I-1] <= K < ptr[I].
 *  \param[in] ind For block K, ind[K] is the element index (in units
 *  of incx) of the first of the 7 entries of x that block touches.
 *  \param[in] val Off-diagonal block values, 8*7 values per block.
 *  \param[in] diag Diagonal block values, 8*8 values per block row.
 *  \param[in] alpha Scale factor applied to x through BLAS_xSCAL
 *  before the solve.
 *  \param[in,out] x Vector overwritten with the result.
 *  \param[in] incx Stride between consecutive logical elements of x.
 *
 *  NOTE(review): the BLAS_xSCAL call scales M*8 elements starting
 *  at x, whereas the block rows being solved start at x + d0*incx.
 *  The two ranges coincide only when d0 == 0.  Confirm that callers
 *  either pass d0 == 0 or alpha == 1.
 */
void
MBCSR_MatHermTrisolve_Lower_v1_aX_xsX(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x, oski_index_t incx )
{
	oski_index_t I;
	/* Start at the last block row and its diagonal block. */
	oski_value_t* bp = x + (d0 + (M-1)*8)*incx;
	const oski_value_t* dp = diag + (M-1)*8*8;

	/* Apply alpha once, up front, to M*8 strided elements of x. */
	{
		oski_index_t m = M * 8;
		BLAS_xSCAL( &m, &alpha, x, &incx );
	}

	/* Block rows are visited in reverse: I = M, M-1, ..., 1. */
	for( I = M; I != 0; I--, bp -= 8*incx, dp -= 8*8 )
	{
		oski_index_t K;
		REGISTER oski_value_t _b0;
		REGISTER oski_value_t _b1;
		REGISTER oski_value_t _b2;
		REGISTER oski_value_t _b3;
		REGISTER oski_value_t _b4;
		REGISTER oski_value_t _b5;
		REGISTER oski_value_t _b6;
		REGISTER oski_value_t _b7;
		/* First off-diagonal block of this block row (56 values each). */
		const oski_value_t* vp = val + ptr[I-1]*8*7;

		/* Load the 8 entries of this block row into locals. */
		VAL_ASSIGN( _b0, bp[0] );
		VAL_ASSIGN( _b1, bp[1*incx] );
		VAL_ASSIGN( _b2, bp[2*incx] );
		VAL_ASSIGN( _b3, bp[3*incx] );
		VAL_ASSIGN( _b4, bp[4*incx] );
		VAL_ASSIGN( _b5, bp[5*incx] );
		VAL_ASSIGN( _b6, bp[6*incx] );
		VAL_ASSIGN( _b7, bp[7*incx] );

		/*
		 *  Diagonal-block solve, from entry 7 down to entry 0: entry i
		 *  is processed with dp[i*8+i], then each entry j < i is
		 *  updated with dp[i*8+j] and the new entry i.
		 */
		VAL_DIVEQ_CONJ( _b7, dp[7*8+7] );
		VAL_MSUB_CONJ( _b0, dp[7*8+0], _b7 );
		VAL_MSUB_CONJ( _b1, dp[7*8+1], _b7 );
		VAL_MSUB_CONJ( _b2, dp[7*8+2], _b7 );
		VAL_MSUB_CONJ( _b3, dp[7*8+3], _b7 );
		VAL_MSUB_CONJ( _b4, dp[7*8+4], _b7 );
		VAL_MSUB_CONJ( _b5, dp[7*8+5], _b7 );
		VAL_MSUB_CONJ( _b6, dp[7*8+6], _b7 );
		VAL_DIVEQ_CONJ( _b6, dp[6*8+6] );
		VAL_MSUB_CONJ( _b0, dp[6*8+0], _b6 );
		VAL_MSUB_CONJ( _b1, dp[6*8+1], _b6 );
		VAL_MSUB_CONJ( _b2, dp[6*8+2], _b6 );
		VAL_MSUB_CONJ( _b3, dp[6*8+3], _b6 );
		VAL_MSUB_CONJ( _b4, dp[6*8+4], _b6 );
		VAL_MSUB_CONJ( _b5, dp[6*8+5], _b6 );
		VAL_DIVEQ_CONJ( _b5, dp[5*8+5] );
		VAL_MSUB_CONJ( _b0, dp[5*8+0], _b5 );
		VAL_MSUB_CONJ( _b1, dp[5*8+1], _b5 );
		VAL_MSUB_CONJ( _b2, dp[5*8+2], _b5 );
		VAL_MSUB_CONJ( _b3, dp[5*8+3], _b5 );
		VAL_MSUB_CONJ( _b4, dp[5*8+4], _b5 );
		VAL_DIVEQ_CONJ( _b4, dp[4*8+4] );
		VAL_MSUB_CONJ( _b0, dp[4*8+0], _b4 );
		VAL_MSUB_CONJ( _b1, dp[4*8+1], _b4 );
		VAL_MSUB_CONJ( _b2, dp[4*8+2], _b4 );
		VAL_MSUB_CONJ( _b3, dp[4*8+3], _b4 );
		VAL_DIVEQ_CONJ( _b3, dp[3*8+3] );
		VAL_MSUB_CONJ( _b0, dp[3*8+0], _b3 );
		VAL_MSUB_CONJ( _b1, dp[3*8+1], _b3 );
		VAL_MSUB_CONJ( _b2, dp[3*8+2], _b3 );
		VAL_DIVEQ_CONJ( _b2, dp[2*8+2] );
		VAL_MSUB_CONJ( _b0, dp[2*8+0], _b2 );
		VAL_MSUB_CONJ( _b1, dp[2*8+1], _b2 );
		VAL_DIVEQ_CONJ( _b1, dp[1*8+1] );
		VAL_MSUB_CONJ( _b0, dp[1*8+0], _b1 );
		VAL_DIVEQ_CONJ( _b0, dp[0*8+0] );

		/*
		 *  Push the solved entries through every off-diagonal block of
		 *  this block row: entry c of the 7 addressed x values is
		 *  updated with vp[r*7+c] and solved entry r, for r = 0..7.
		 */
		for( K = ptr[I-1]; K < ptr[I]; K++, vp += 8*7 )
		{
			oski_index_t j0 = ind[K];
			oski_value_t* xp = x + j0*incx;
			REGISTER oski_value_t _x0;
			REGISTER oski_value_t _x1;
			REGISTER oski_value_t _x2;
			REGISTER oski_value_t _x3;
			REGISTER oski_value_t _x4;
			REGISTER oski_value_t _x5;
			REGISTER oski_value_t _x6;

			VAL_ASSIGN( _x0, xp[0] );
			VAL_ASSIGN( _x1, xp[1*incx] );
			VAL_ASSIGN( _x2, xp[2*incx] );
			VAL_ASSIGN( _x3, xp[3*incx] );
			VAL_ASSIGN( _x4, xp[4*incx] );
			VAL_ASSIGN( _x5, xp[5*incx] );
			VAL_ASSIGN( _x6, xp[6*incx] );

			VAL_MSUB_CONJ( _x0, vp[0], _b0 );
			VAL_MSUB_CONJ( _x1, vp[1], _b0 );
			VAL_MSUB_CONJ( _x2, vp[2], _b0 );
			VAL_MSUB_CONJ( _x3, vp[3], _b0 );
			VAL_MSUB_CONJ( _x4, vp[4], _b0 );
			VAL_MSUB_CONJ( _x5, vp[5], _b0 );
			VAL_MSUB_CONJ( _x6, vp[6], _b0 );
			VAL_MSUB_CONJ( _x0, vp[7], _b1 );
			VAL_MSUB_CONJ( _x1, vp[8], _b1 );
			VAL_MSUB_CONJ( _x2, vp[9], _b1 );
			VAL_MSUB_CONJ( _x3, vp[10], _b1 );
			VAL_MSUB_CONJ( _x4, vp[11], _b1 );
			VAL_MSUB_CONJ( _x5, vp[12], _b1 );
			VAL_MSUB_CONJ( _x6, vp[13], _b1 );
			VAL_MSUB_CONJ( _x0, vp[14], _b2 );
			VAL_MSUB_CONJ( _x1, vp[15], _b2 );
			VAL_MSUB_CONJ( _x2, vp[16], _b2 );
			VAL_MSUB_CONJ( _x3, vp[17], _b2 );
			VAL_MSUB_CONJ( _x4, vp[18], _b2 );
			VAL_MSUB_CONJ( _x5, vp[19], _b2 );
			VAL_MSUB_CONJ( _x6, vp[20], _b2 );
			VAL_MSUB_CONJ( _x0, vp[21], _b3 );
			VAL_MSUB_CONJ( _x1, vp[22], _b3 );
			VAL_MSUB_CONJ( _x2, vp[23], _b3 );
			VAL_MSUB_CONJ( _x3, vp[24], _b3 );
			VAL_MSUB_CONJ( _x4, vp[25], _b3 );
			VAL_MSUB_CONJ( _x5, vp[26], _b3 );
			VAL_MSUB_CONJ( _x6, vp[27], _b3 );
			VAL_MSUB_CONJ( _x0, vp[28], _b4 );
			VAL_MSUB_CONJ( _x1, vp[29], _b4 );
			VAL_MSUB_CONJ( _x2, vp[30], _b4 );
			VAL_MSUB_CONJ( _x3, vp[31], _b4 );
			VAL_MSUB_CONJ( _x4, vp[32], _b4 );
			VAL_MSUB_CONJ( _x5, vp[33], _b4 );
			VAL_MSUB_CONJ( _x6, vp[34], _b4 );
			VAL_MSUB_CONJ( _x0, vp[35], _b5 );
			VAL_MSUB_CONJ( _x1, vp[36], _b5 );
			VAL_MSUB_CONJ( _x2, vp[37], _b5 );
			VAL_MSUB_CONJ( _x3, vp[38], _b5 );
			VAL_MSUB_CONJ( _x4, vp[39], _b5 );
			VAL_MSUB_CONJ( _x5, vp[40], _b5 );
			VAL_MSUB_CONJ( _x6, vp[41], _b5 );
			VAL_MSUB_CONJ( _x0, vp[42], _b6 );
			VAL_MSUB_CONJ( _x1, vp[43], _b6 );
			VAL_MSUB_CONJ( _x2, vp[44], _b6 );
			VAL_MSUB_CONJ( _x3, vp[45], _b6 );
			VAL_MSUB_CONJ( _x4, vp[46], _b6 );
			VAL_MSUB_CONJ( _x5, vp[47], _b6 );
			VAL_MSUB_CONJ( _x6, vp[48], _b6 );
			VAL_MSUB_CONJ( _x0, vp[49], _b7 );
			VAL_MSUB_CONJ( _x1, vp[50], _b7 );
			VAL_MSUB_CONJ( _x2, vp[51], _b7 );
			VAL_MSUB_CONJ( _x3, vp[52], _b7 );
			VAL_MSUB_CONJ( _x4, vp[53], _b7 );
			VAL_MSUB_CONJ( _x5, vp[54], _b7 );
			VAL_MSUB_CONJ( _x6, vp[55], _b7 );
			VAL_ASSIGN( xp[0], _x0 );
			VAL_ASSIGN( xp[1*incx], _x1 );
			VAL_ASSIGN( xp[2*incx], _x2 );
			VAL_ASSIGN( xp[3*incx], _x3 );
			VAL_ASSIGN( xp[4*incx], _x4 );
			VAL_ASSIGN( xp[5*incx], _x5 );
			VAL_ASSIGN( xp[6*incx], _x6 );
		} /* K */

		/* Write the solved block row back into x. */
		VAL_ASSIGN( bp[0], _b0 );
		VAL_ASSIGN( bp[1*incx], _b1 );
		VAL_ASSIGN( bp[2*incx], _b2 );
		VAL_ASSIGN( bp[3*incx], _b3 );
		VAL_ASSIGN( bp[4*incx], _b4 );
		VAL_ASSIGN( bp[5*incx], _b5 );
		VAL_ASSIGN( bp[6*incx], _b6 );
		VAL_ASSIGN( bp[7*incx], _b7 );
	} /* I */
} /* MBCSR_MatHermTrisolve_Lower_v1_aX_xsX */

#endif /* IS_VAL_COMPLEX */
/**
 *  \brief Module wrapper for the \f$8\times 7\f$ implementation
 *  of the conjugate-transposed sparse triangular solve, where the
 *  stored matrix is lower triangular.
 *
 *  Every column of the vector view x is solved in place, one
 *  column at a time.  Columns whose row stride is 1 go to the
 *  unit-stride kernel; any other row stride goes to the
 *  general-stride kernel.  In real-valued builds the kernel names
 *  used here are macro synonyms for the transpose kernels.
 *
 *  \param[in] T 8x7 MBCSR submatrix (checked by assertion).
 *  \param[in] alpha Scale factor forwarded to the kernel.
 *  \param[in,out] x Vector view overwritten with the result.
 */
static void
MatHermTrisolve_Lower( const oski_submatMBCSR_t* T,
	oski_value_t alpha, oski_vecview_t x )
{
	oski_index_t cols_left;
	oski_value_t* col;

	assert( T != NULL );
	assert( T->r == 8 );
	assert( T->c == 7 );
	assert( x != INVALID_VEC );

	col = x->val;
	for( cols_left = x->num_cols; cols_left > 0; cols_left-- )
	{
		if( x->rowinc == 1 )
		{
			MBCSR_MatHermTrisolve_Lower_v1_aX_xs1(
				T->num_block_rows, T->offset,
				T->bptr, T->bind, T->bval, T->bdiag,
				alpha, col );
		}
		else
		{
			/* General (non-unit) row stride. */
			MBCSR_MatHermTrisolve_Lower_v1_aX_xsX(
				T->num_block_rows, T->offset,
				T->bptr, T->bind, T->bval, T->bdiag,
				alpha, col, x->rowinc );
		}
		col += x->colinc;
	}
}

#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatTrisolve_Upper_v1_aX_xs1. */
	#define MBCSR_MatTrisolve_Upper_v1_aX_xs1 \
		MANGLE_MOD_(MBCSR_MatTrisolve_Upper_v1_aX_xs1_8x7)
#endif

/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$U^{-1}\cdot b\f$, where x has unit-stride.
 *
 *  The solve is done in place on x, walking the block rows from
 *  the last (I = M) to the first (I = 1).  For each block row the
 *  8 entries at bp are multiplied by alpha, reduced by the
 *  contributions of the block row's 8x7 off-diagonal blocks (which
 *  only read x), and finally solved against the 8x8 diagonal block.
 *
 *  The VAL_* macros are defined outside this file.  The comments
 *  below assume VAL_MUL(z,a,b) is z = a*b, VAL_ASSIGN(z,a) is z = a,
 *  VAL_DIVEQ(z,a) is z /= a and VAL_MSUB(z,a,b) is z -= a*b --
 *  confirm against their definitions.
 *
 *  \param[in] M Number of block rows processed.
 *  \param[in] d0 Element offset of the first block row within x.
 *  \param[in] ptr Block-row pointers: block row I-1 owns the
 *  off-diagonal blocks ptr[I-1] <= K < ptr[I].
 *  \param[in] ind For block K, ind[K] is the index of the first of
 *  the 7 entries of x that block reads.
 *  \param[in] val Off-diagonal block values, 8*7 values per block.
 *  \param[in] diag Diagonal block values, 8*8 values per block row.
 *  \param[in] alpha Scale factor applied to each block row as it
 *  is loaded.
 *  \param[in,out] x Unit-stride vector overwritten with the result.
 */
void
MBCSR_MatTrisolve_Upper_v1_aX_xs1(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x )
{
	oski_index_t I;
	/* Start at the last block row and its diagonal block. */
	oski_value_t* bp = x + (d0 + (M-1)*8);
	const oski_value_t* dp = diag + (M-1)*8*8;

	/* Block rows are visited in reverse: I = M, M-1, ..., 1. */
	for( I = M; I != 0; I--, bp -= 8, dp -= 8*8 )
	{
		oski_index_t K;
		REGISTER oski_value_t _b0;
		REGISTER oski_value_t _b1;
		REGISTER oski_value_t _b2;
		REGISTER oski_value_t _b3;
		REGISTER oski_value_t _b4;
		REGISTER oski_value_t _b5;
		REGISTER oski_value_t _b6;
		REGISTER oski_value_t _b7;
		/* First off-diagonal block of this block row (56 values each). */
		const oski_value_t* vp = val + ptr[I-1]*8*7;

		/* Load the 8 entries of this block row, scaled by alpha. */
		VAL_MUL( _b0, alpha, bp[0] );
		VAL_MUL( _b1, alpha, bp[1] );
		VAL_MUL( _b2, alpha, bp[2] );
		VAL_MUL( _b3, alpha, bp[3] );
		VAL_MUL( _b4, alpha, bp[4] );
		VAL_MUL( _b5, alpha, bp[5] );
		VAL_MUL( _b6, alpha, bp[6] );
		VAL_MUL( _b7, alpha, bp[7] );

		/*
		 *  Remove the off-diagonal contributions: for each block, entry
		 *  r of the block row loses vp[r*7+c] times x entry c, taken
		 *  one x entry (c = 0..6) at a time.
		 */
		for( K = ptr[I-1]; K < ptr[I]; K++, vp += 8*7 )
		{
			oski_index_t j0 = ind[K];
			const oski_value_t* xp = x + j0;
			REGISTER oski_value_t _x0;
			REGISTER oski_value_t _x1;
			REGISTER oski_value_t _x2;
			REGISTER oski_value_t _x3;
			REGISTER oski_value_t _x4;
			REGISTER oski_value_t _x5;
			REGISTER oski_value_t _x6;

			VAL_ASSIGN( _x0, xp[0] );
			VAL_ASSIGN( _x1, xp[1] );
			VAL_ASSIGN( _x2, xp[2] );
			VAL_ASSIGN( _x3, xp[3] );
			VAL_ASSIGN( _x4, xp[4] );
			VAL_ASSIGN( _x5, xp[5] );
			VAL_ASSIGN( _x6, xp[6] );

			VAL_MSUB( _b0, vp[0], _x0 );
			VAL_MSUB( _b1, vp[7], _x0 );
			VAL_MSUB( _b2, vp[14], _x0 );
			VAL_MSUB( _b3, vp[21], _x0 );
			VAL_MSUB( _b4, vp[28], _x0 );
			VAL_MSUB( _b5, vp[35], _x0 );
			VAL_MSUB( _b6, vp[42], _x0 );
			VAL_MSUB( _b7, vp[49], _x0 );
			VAL_MSUB( _b0, vp[1], _x1 );
			VAL_MSUB( _b1, vp[8], _x1 );
			VAL_MSUB( _b2, vp[15], _x1 );
			VAL_MSUB( _b3, vp[22], _x1 );
			VAL_MSUB( _b4, vp[29], _x1 );
			VAL_MSUB( _b5, vp[36], _x1 );
			VAL_MSUB( _b6, vp[43], _x1 );
			VAL_MSUB( _b7, vp[50], _x1 );
			VAL_MSUB( _b0, vp[2], _x2 );
			VAL_MSUB( _b1, vp[9], _x2 );
			VAL_MSUB( _b2, vp[16], _x2 );
			VAL_MSUB( _b3, vp[23], _x2 );
			VAL_MSUB( _b4, vp[30], _x2 );
			VAL_MSUB( _b5, vp[37], _x2 );
			VAL_MSUB( _b6, vp[44], _x2 );
			VAL_MSUB( _b7, vp[51], _x2 );
			VAL_MSUB( _b0, vp[3], _x3 );
			VAL_MSUB( _b1, vp[10], _x3 );
			VAL_MSUB( _b2, vp[17], _x3 );
			VAL_MSUB( _b3, vp[24], _x3 );
			VAL_MSUB( _b4, vp[31], _x3 );
			VAL_MSUB( _b5, vp[38], _x3 );
			VAL_MSUB( _b6, vp[45], _x3 );
			VAL_MSUB( _b7, vp[52], _x3 );
			VAL_MSUB( _b0, vp[4], _x4 );
			VAL_MSUB( _b1, vp[11], _x4 );
			VAL_MSUB( _b2, vp[18], _x4 );
			VAL_MSUB( _b3, vp[25], _x4 );
			VAL_MSUB( _b4, vp[32], _x4 );
			VAL_MSUB( _b5, vp[39], _x4 );
			VAL_MSUB( _b6, vp[46], _x4 );
			VAL_MSUB( _b7, vp[53], _x4 );
			VAL_MSUB( _b0, vp[5], _x5 );
			VAL_MSUB( _b1, vp[12], _x5 );
			VAL_MSUB( _b2, vp[19], _x5 );
			VAL_MSUB( _b3, vp[26], _x5 );
			VAL_MSUB( _b4, vp[33], _x5 );
			VAL_MSUB( _b5, vp[40], _x5 );
			VAL_MSUB( _b6, vp[47], _x5 );
			VAL_MSUB( _b7, vp[54], _x5 );
			VAL_MSUB( _b0, vp[6], _x6 );
			VAL_MSUB( _b1, vp[13], _x6 );
			VAL_MSUB( _b2, vp[20], _x6 );
			VAL_MSUB( _b3, vp[27], _x6 );
			VAL_MSUB( _b4, vp[34], _x6 );
			VAL_MSUB( _b5, vp[41], _x6 );
			VAL_MSUB( _b6, vp[48], _x6 );
			VAL_MSUB( _b7, vp[55], _x6 );
		} /* K */

		/*
		 *  Diagonal-block solve, from entry 7 down to entry 0: entry j
		 *  is divided by dp[j*8+j], then dp[i*8+j] times the new
		 *  entry j is removed from every entry i < j.
		 */
		VAL_DIVEQ( _b7, dp[7*8+7] );
		VAL_MSUB( _b0, dp[0*8+7], _b7 );
		VAL_MSUB( _b1, dp[1*8+7], _b7 );
		VAL_MSUB( _b2, dp[2*8+7], _b7 );
		VAL_MSUB( _b3, dp[3*8+7], _b7 );
		VAL_MSUB( _b4, dp[4*8+7], _b7 );
		VAL_MSUB( _b5, dp[5*8+7], _b7 );
		VAL_MSUB( _b6, dp[6*8+7], _b7 );
		VAL_DIVEQ( _b6, dp[6*8+6] );
		VAL_MSUB( _b0, dp[0*8+6], _b6 );
		VAL_MSUB( _b1, dp[1*8+6], _b6 );
		VAL_MSUB( _b2, dp[2*8+6], _b6 );
		VAL_MSUB( _b3, dp[3*8+6], _b6 );
		VAL_MSUB( _b4, dp[4*8+6], _b6 );
		VAL_MSUB( _b5, dp[5*8+6], _b6 );
		VAL_DIVEQ( _b5, dp[5*8+5] );
		VAL_MSUB( _b0, dp[0*8+5], _b5 );
		VAL_MSUB( _b1, dp[1*8+5], _b5 );
		VAL_MSUB( _b2, dp[2*8+5], _b5 );
		VAL_MSUB( _b3, dp[3*8+5], _b5 );
		VAL_MSUB( _b4, dp[4*8+5], _b5 );
		VAL_DIVEQ( _b4, dp[4*8+4] );
		VAL_MSUB( _b0, dp[0*8+4], _b4 );
		VAL_MSUB( _b1, dp[1*8+4], _b4 );
		VAL_MSUB( _b2, dp[2*8+4], _b4 );
		VAL_MSUB( _b3, dp[3*8+4], _b4 );
		VAL_DIVEQ( _b3, dp[3*8+3] );
		VAL_MSUB( _b0, dp[0*8+3], _b3 );
		VAL_MSUB( _b1, dp[1*8+3], _b3 );
		VAL_MSUB( _b2, dp[2*8+3], _b3 );
		VAL_DIVEQ( _b2, dp[2*8+2] );
		VAL_MSUB( _b0, dp[0*8+2], _b2 );
		VAL_MSUB( _b1, dp[1*8+2], _b2 );
		VAL_DIVEQ( _b1, dp[1*8+1] );
		VAL_MSUB( _b0, dp[0*8+1], _b1 );
		VAL_DIVEQ( _b0, dp[0*8+0] );
		/* Write the solved block row back into x. */
		VAL_ASSIGN( bp[0], _b0 );
		VAL_ASSIGN( bp[1], _b1 );
		VAL_ASSIGN( bp[2], _b2 );
		VAL_ASSIGN( bp[3], _b3 );
		VAL_ASSIGN( bp[4], _b4 );
		VAL_ASSIGN( bp[5], _b5 );
		VAL_ASSIGN( bp[6], _b6 );
		VAL_ASSIGN( bp[7], _b7 );
	} /* I */
} /* MBCSR_MatTrisolve_Upper_v1_aX_xs1 */

#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatTrisolve_Upper_v1_aX_xsX. */
	#define MBCSR_MatTrisolve_Upper_v1_aX_xsX \
		MANGLE_MOD_(MBCSR_MatTrisolve_Upper_v1_aX_xsX_8x7)
#endif

/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$U^{-1}\cdot b\f$, where x has general-stride.
 *
 *  The solve is done in place on x, walking the block rows from
 *  the last (I = M) to the first (I = 1).  For each block row the
 *  8 entries at bp are multiplied by alpha, reduced by the
 *  contributions of the block row's 8x7 off-diagonal blocks (which
 *  only read x), and finally solved against the 8x8 diagonal block.
 *
 *  The VAL_* macros are defined outside this file.  The comments
 *  below assume VAL_MUL(z,a,b) is z = a*b, VAL_ASSIGN(z,a) is z = a,
 *  VAL_DIVEQ(z,a) is z /= a and VAL_MSUB(z,a,b) is z -= a*b --
 *  confirm against their definitions.
 *
 *  \param[in] M Number of block rows processed.
 *  \param[in] d0 Element offset (in units of incx) of the first
 *  block row within x.
 *  \param[in] ptr Block-row pointers: block row I-1 owns the
 *  off-diagonal blocks ptr[I-1] <= K < ptr[I].
 *  \param[in] ind For block K, ind[K] is the element index (in units
 *  of incx) of the first of the 7 entries of x that block reads.
 *  \param[in] val Off-diagonal block values, 8*7 values per block.
 *  \param[in] diag Diagonal block values, 8*8 values per block row.
 *  \param[in] alpha Scale factor applied to each block row as it
 *  is loaded.
 *  \param[in,out] x Vector overwritten with the result.
 *  \param[in] incx Stride between consecutive logical elements of x.
 */
void
MBCSR_MatTrisolve_Upper_v1_aX_xsX(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x, oski_index_t incx )
{
	oski_index_t I;
	/* Start at the last block row and its diagonal block. */
	oski_value_t* bp = x + (d0 + (M-1)*8)*incx;
	const oski_value_t* dp = diag + (M-1)*8*8;

	/* Block rows are visited in reverse: I = M, M-1, ..., 1. */
	for( I = M; I != 0; I--, bp -= 8*incx, dp -= 8*8 )
	{
		oski_index_t K;
		REGISTER oski_value_t _b0;
		REGISTER oski_value_t _b1;
		REGISTER oski_value_t _b2;
		REGISTER oski_value_t _b3;
		REGISTER oski_value_t _b4;
		REGISTER oski_value_t _b5;
		REGISTER oski_value_t _b6;
		REGISTER oski_value_t _b7;
		/* First off-diagonal block of this block row (56 values each). */
		const oski_value_t* vp = val + ptr[I-1]*8*7;

		/* Load the 8 entries of this block row, scaled by alpha. */
		VAL_MUL( _b0, alpha, bp[0] );
		VAL_MUL( _b1, alpha, bp[1*incx] );
		VAL_MUL( _b2, alpha, bp[2*incx] );
		VAL_MUL( _b3, alpha, bp[3*incx] );
		VAL_MUL( _b4, alpha, bp[4*incx] );
		VAL_MUL( _b5, alpha, bp[5*incx] );
		VAL_MUL( _b6, alpha, bp[6*incx] );
		VAL_MUL( _b7, alpha, bp[7*incx] );

		/*
		 *  Remove the off-diagonal contributions: for each block, entry
		 *  r of the block row loses vp[r*7+c] times x entry c, taken
		 *  one x entry (c = 0..6) at a time.
		 */
		for( K = ptr[I-1]; K < ptr[I]; K++, vp += 8*7 )
		{
			oski_index_t j0 = ind[K];
			const oski_value_t* xp = x + j0*incx;
			REGISTER oski_value_t _x0;
			REGISTER oski_value_t _x1;
			REGISTER oski_value_t _x2;
			REGISTER oski_value_t _x3;
			REGISTER oski_value_t _x4;
			REGISTER oski_value_t _x5;
			REGISTER oski_value_t _x6;

			VAL_ASSIGN( _x0, xp[0] );
			VAL_ASSIGN( _x1, xp[1*incx] );
			VAL_ASSIGN( _x2, xp[2*incx] );
			VAL_ASSIGN( _x3, xp[3*incx] );
			VAL_ASSIGN( _x4, xp[4*incx] );
			VAL_ASSIGN( _x5, xp[5*incx] );
			VAL_ASSIGN( _x6, xp[6*incx] );

			VAL_MSUB( _b0, vp[0], _x0 );
			VAL_MSUB( _b1, vp[7], _x0 );
			VAL_MSUB( _b2, vp[14], _x0 );
			VAL_MSUB( _b3, vp[21], _x0 );
			VAL_MSUB( _b4, vp[28], _x0 );
			VAL_MSUB( _b5, vp[35], _x0 );
			VAL_MSUB( _b6, vp[42], _x0 );
			VAL_MSUB( _b7, vp[49], _x0 );
			VAL_MSUB( _b0, vp[1], _x1 );
			VAL_MSUB( _b1, vp[8], _x1 );
			VAL_MSUB( _b2, vp[15], _x1 );
			VAL_MSUB( _b3, vp[22], _x1 );
			VAL_MSUB( _b4, vp[29], _x1 );
			VAL_MSUB( _b5, vp[36], _x1 );
			VAL_MSUB( _b6, vp[43], _x1 );
			VAL_MSUB( _b7, vp[50], _x1 );
			VAL_MSUB( _b0, vp[2], _x2 );
			VAL_MSUB( _b1, vp[9], _x2 );
			VAL_MSUB( _b2, vp[16], _x2 );
			VAL_MSUB( _b3, vp[23], _x2 );
			VAL_MSUB( _b4, vp[30], _x2 );
			VAL_MSUB( _b5, vp[37], _x2 );
			VAL_MSUB( _b6, vp[44], _x2 );
			VAL_MSUB( _b7, vp[51], _x2 );
			VAL_MSUB( _b0, vp[3], _x3 );
			VAL_MSUB( _b1, vp[10], _x3 );
			VAL_MSUB( _b2, vp[17], _x3 );
			VAL_MSUB( _b3, vp[24], _x3 );
			VAL_MSUB( _b4, vp[31], _x3 );
			VAL_MSUB( _b5, vp[38], _x3 );
			VAL_MSUB( _b6, vp[45], _x3 );
			VAL_MSUB( _b7, vp[52], _x3 );
			VAL_MSUB( _b0, vp[4], _x4 );
			VAL_MSUB( _b1, vp[11], _x4 );
			VAL_MSUB( _b2, vp[18], _x4 );
			VAL_MSUB( _b3, vp[25], _x4 );
			VAL_MSUB( _b4, vp[32], _x4 );
			VAL_MSUB( _b5, vp[39], _x4 );
			VAL_MSUB( _b6, vp[46], _x4 );
			VAL_MSUB( _b7, vp[53], _x4 );
			VAL_MSUB( _b0, vp[5], _x5 );
			VAL_MSUB( _b1, vp[12], _x5 );
			VAL_MSUB( _b2, vp[19], _x5 );
			VAL_MSUB( _b3, vp[26], _x5 );
			VAL_MSUB( _b4, vp[33], _x5 );
			VAL_MSUB( _b5, vp[40], _x5 );
			VAL_MSUB( _b6, vp[47], _x5 );
			VAL_MSUB( _b7, vp[54], _x5 );
			VAL_MSUB( _b0, vp[6], _x6 );
			VAL_MSUB( _b1, vp[13], _x6 );
			VAL_MSUB( _b2, vp[20], _x6 );
			VAL_MSUB( _b3, vp[27], _x6 );
			VAL_MSUB( _b4, vp[34], _x6 );
			VAL_MSUB( _b5, vp[41], _x6 );
			VAL_MSUB( _b6, vp[48], _x6 );
			VAL_MSUB( _b7, vp[55], _x6 );
		} /* K */

		/*
		 *  Diagonal-block solve, from entry 7 down to entry 0: entry j
		 *  is divided by dp[j*8+j], then dp[i*8+j] times the new
		 *  entry j is removed from every entry i < j.
		 */
		VAL_DIVEQ( _b7, dp[7*8+7] );
		VAL_MSUB( _b0, dp[0*8+7], _b7 );
		VAL_MSUB( _b1, dp[1*8+7], _b7 );
		VAL_MSUB( _b2, dp[2*8+7], _b7 );
		VAL_MSUB( _b3, dp[3*8+7], _b7 );
		VAL_MSUB( _b4, dp[4*8+7], _b7 );
		VAL_MSUB( _b5, dp[5*8+7], _b7 );
		VAL_MSUB( _b6, dp[6*8+7], _b7 );
		VAL_DIVEQ( _b6, dp[6*8+6] );
		VAL_MSUB( _b0, dp[0*8+6], _b6 );
		VAL_MSUB( _b1, dp[1*8+6], _b6 );
		VAL_MSUB( _b2, dp[2*8+6], _b6 );
		VAL_MSUB( _b3, dp[3*8+6], _b6 );
		VAL_MSUB( _b4, dp[4*8+6], _b6 );
		VAL_MSUB( _b5, dp[5*8+6], _b6 );
		VAL_DIVEQ( _b5, dp[5*8+5] );
		VAL_MSUB( _b0, dp[0*8+5], _b5 );
		VAL_MSUB( _b1, dp[1*8+5], _b5 );
		VAL_MSUB( _b2, dp[2*8+5], _b5 );
		VAL_MSUB( _b3, dp[3*8+5], _b5 );
		VAL_MSUB( _b4, dp[4*8+5], _b5 );
		VAL_DIVEQ( _b4, dp[4*8+4] );
		VAL_MSUB( _b0, dp[0*8+4], _b4 );
		VAL_MSUB( _b1, dp[1*8+4], _b4 );
		VAL_MSUB( _b2, dp[2*8+4], _b4 );
		VAL_MSUB( _b3, dp[3*8+4], _b4 );
		VAL_DIVEQ( _b3, dp[3*8+3] );
		VAL_MSUB( _b0, dp[0*8+3], _b3 );
		VAL_MSUB( _b1, dp[1*8+3], _b3 );
		VAL_MSUB( _b2, dp[2*8+3], _b3 );
		VAL_DIVEQ( _b2, dp[2*8+2] );
		VAL_MSUB( _b0, dp[0*8+2], _b2 );
		VAL_MSUB( _b1, dp[1*8+2], _b2 );
		VAL_DIVEQ( _b1, dp[1*8+1] );
		VAL_MSUB( _b0, dp[0*8+1], _b1 );
		VAL_DIVEQ( _b0, dp[0*8+0] );
		/* Write the solved block row back into x. */
		VAL_ASSIGN( bp[0], _b0 );
		VAL_ASSIGN( bp[1*incx], _b1 );
		VAL_ASSIGN( bp[2*incx], _b2 );
		VAL_ASSIGN( bp[3*incx], _b3 );
		VAL_ASSIGN( bp[4*incx], _b4 );
		VAL_ASSIGN( bp[5*incx], _b5 );
		VAL_ASSIGN( bp[6*incx], _b6 );
		VAL_ASSIGN( bp[7*incx], _b7 );
	} /* I */
} /* MBCSR_MatTrisolve_Upper_v1_aX_xsX */

/**
 *  \brief Module wrapper for the \f$8\times 7\f$ implementation
 *  of the sparse triangular solve, where the matrix is upper
 *  triangular.
 *
 *  Every column of the vector view x is solved in place, one
 *  column at a time.  Columns whose row stride is 1 go to the
 *  unit-stride kernel; any other row stride goes to the
 *  general-stride kernel.
 *
 *  \param[in] T 8x7 MBCSR submatrix (checked by assertion).
 *  \param[in] alpha Scale factor forwarded to the kernel.
 *  \param[in,out] x Vector view overwritten with the result.
 */
static void
MatTrisolve_Upper( const oski_submatMBCSR_t* T,
	oski_value_t alpha, oski_vecview_t x )
{
	oski_index_t cols_left;
	oski_value_t* col;

	assert( T != NULL );
	assert( T->r == 8 );
	assert( T->c == 7 );
	assert( x != INVALID_VEC );

	col = x->val;
	for( cols_left = x->num_cols; cols_left > 0; cols_left-- )
	{
		if( x->rowinc == 1 )
		{
			MBCSR_MatTrisolve_Upper_v1_aX_xs1(
				T->num_block_rows, T->offset,
				T->bptr, T->bind, T->bval, T->bdiag,
				alpha, col );
		}
		else
		{
			/* General (non-unit) row stride. */
			MBCSR_MatTrisolve_Upper_v1_aX_xsX(
				T->num_block_rows, T->offset,
				T->bptr, T->bind, T->bval, T->bdiag,
				alpha, col, x->rowinc );
		}
		col += x->colinc;
	}
}

#if !IS_VAL_COMPLEX
	/**
	 *  Real-valued build: no separate conjugate kernel is compiled;
	 *  the name is a synonym for the plain upper-triangular kernel.
	 */
	#define MBCSR_MatConjTrisolve_Upper_v1_aX_xs1 MBCSR_MatTrisolve_Upper_v1_aX_xs1
#else /* IS_VAL_COMPLEX */

#if defined(DO_NAME_MANGLING)
	/**
	 *  Complex-valued build with name mangling: mangled name for
	 *  MBCSR_MatConjTrisolve_Upper_v1_aX_xs1.
	 */
	#define MBCSR_MatConjTrisolve_Upper_v1_aX_xs1 \
		MANGLE_MOD_(MBCSR_MatConjTrisolve_Upper_v1_aX_xs1_8x7)
#endif

#endif
#if IS_VAL_COMPLEX
/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$\bar{U}^{-1}\cdot b\f$, where x has unit-stride.
 *
 *  Block rows are processed from the last (I == M) to the first
 *  (I == 1). For each block row the 8 entries of x are scaled by
 *  \p alpha, the contributions of the off-diagonal blocks are
 *  subtracted, the 8x8 diagonal block is eliminated from its last
 *  row upward, and the 8 results are stored back into x.
 *
 *  The arithmetic is done through the VAL_* macros, which are
 *  defined outside this file; the *_CONJ variants presumably apply
 *  the complex conjugate of the matrix entry (confirm against
 *  their definitions).
 *
 *  \param[in] M      Number of block rows to process.
 *  \param[in] d0     Offset into x of the first row of block row 1.
 *  \param[in] ptr    Block-row pointers: blocks of block row I are
 *                    ptr[I-1] .. ptr[I]-1.
 *  \param[in] ind    Per-block offset into x of the block's first column.
 *  \param[in] val    Off-diagonal blocks, 8*7 values each, indexed
 *                    as row*7 + column.
 *  \param[in] diag   Diagonal blocks, 8*8 values each, indexed as
 *                    row*8 + column.
 *  \param[in] alpha  Scalar applied to the entries of x as they are loaded.
 *  \param[in,out] x  Unit-stride vector, overwritten in place.
 */
void
MBCSR_MatConjTrisolve_Upper_v1_aX_xs1(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x )
{
	oski_index_t I;
	/* Start at the last block row and the last diagonal block. */
	oski_value_t* bp = x + (d0 + (M-1)*8);
	const oski_value_t* dp = diag + (M-1)*8*8;

	for( I = M; I != 0; I--, bp -= 8, dp -= 8*8 )
	{
		oski_index_t K;
		REGISTER oski_value_t _b0;
		REGISTER oski_value_t _b1;
		REGISTER oski_value_t _b2;
		REGISTER oski_value_t _b3;
		REGISTER oski_value_t _b4;
		REGISTER oski_value_t _b5;
		REGISTER oski_value_t _b6;
		REGISTER oski_value_t _b7;
		/* First off-diagonal block of this block row. */
		const oski_value_t* vp = val + ptr[I-1]*8*7;

		/* Load the 8 entries of this block row, scaled by alpha. */
		VAL_MUL( _b0, alpha, bp[0] );
		VAL_MUL( _b1, alpha, bp[1] );
		VAL_MUL( _b2, alpha, bp[2] );
		VAL_MUL( _b3, alpha, bp[3] );
		VAL_MUL( _b4, alpha, bp[4] );
		VAL_MUL( _b5, alpha, bp[5] );
		VAL_MUL( _b6, alpha, bp[6] );
		VAL_MUL( _b7, alpha, bp[7] );

		/* Subtract the contribution of every off-diagonal block. */
		for( K = ptr[I-1]; K < ptr[I]; K++, vp += 8*7 )
		{
			oski_index_t j0 = ind[K];
			const oski_value_t* xp = x + j0;
			REGISTER oski_value_t _x0;
			REGISTER oski_value_t _x1;
			REGISTER oski_value_t _x2;
			REGISTER oski_value_t _x3;
			REGISTER oski_value_t _x4;
			REGISTER oski_value_t _x5;
			REGISTER oski_value_t _x6;

			/* The 7 entries of x matching this block's columns. */
			VAL_ASSIGN( _x0, xp[0] );
			VAL_ASSIGN( _x1, xp[1] );
			VAL_ASSIGN( _x2, xp[2] );
			VAL_ASSIGN( _x3, xp[3] );
			VAL_ASSIGN( _x4, xp[4] );
			VAL_ASSIGN( _x5, xp[5] );
			VAL_ASSIGN( _x6, xp[6] );

			/* Column-by-column update: _b<r> uses vp[r*7+c] and _x<c>. */
			VAL_MSUB_CONJ( _b0, vp[0], _x0 );
			VAL_MSUB_CONJ( _b1, vp[7], _x0 );
			VAL_MSUB_CONJ( _b2, vp[14], _x0 );
			VAL_MSUB_CONJ( _b3, vp[21], _x0 );
			VAL_MSUB_CONJ( _b4, vp[28], _x0 );
			VAL_MSUB_CONJ( _b5, vp[35], _x0 );
			VAL_MSUB_CONJ( _b6, vp[42], _x0 );
			VAL_MSUB_CONJ( _b7, vp[49], _x0 );
			VAL_MSUB_CONJ( _b0, vp[1], _x1 );
			VAL_MSUB_CONJ( _b1, vp[8], _x1 );
			VAL_MSUB_CONJ( _b2, vp[15], _x1 );
			VAL_MSUB_CONJ( _b3, vp[22], _x1 );
			VAL_MSUB_CONJ( _b4, vp[29], _x1 );
			VAL_MSUB_CONJ( _b5, vp[36], _x1 );
			VAL_MSUB_CONJ( _b6, vp[43], _x1 );
			VAL_MSUB_CONJ( _b7, vp[50], _x1 );
			VAL_MSUB_CONJ( _b0, vp[2], _x2 );
			VAL_MSUB_CONJ( _b1, vp[9], _x2 );
			VAL_MSUB_CONJ( _b2, vp[16], _x2 );
			VAL_MSUB_CONJ( _b3, vp[23], _x2 );
			VAL_MSUB_CONJ( _b4, vp[30], _x2 );
			VAL_MSUB_CONJ( _b5, vp[37], _x2 );
			VAL_MSUB_CONJ( _b6, vp[44], _x2 );
			VAL_MSUB_CONJ( _b7, vp[51], _x2 );
			VAL_MSUB_CONJ( _b0, vp[3], _x3 );
			VAL_MSUB_CONJ( _b1, vp[10], _x3 );
			VAL_MSUB_CONJ( _b2, vp[17], _x3 );
			VAL_MSUB_CONJ( _b3, vp[24], _x3 );
			VAL_MSUB_CONJ( _b4, vp[31], _x3 );
			VAL_MSUB_CONJ( _b5, vp[38], _x3 );
			VAL_MSUB_CONJ( _b6, vp[45], _x3 );
			VAL_MSUB_CONJ( _b7, vp[52], _x3 );
			VAL_MSUB_CONJ( _b0, vp[4], _x4 );
			VAL_MSUB_CONJ( _b1, vp[11], _x4 );
			VAL_MSUB_CONJ( _b2, vp[18], _x4 );
			VAL_MSUB_CONJ( _b3, vp[25], _x4 );
			VAL_MSUB_CONJ( _b4, vp[32], _x4 );
			VAL_MSUB_CONJ( _b5, vp[39], _x4 );
			VAL_MSUB_CONJ( _b6, vp[46], _x4 );
			VAL_MSUB_CONJ( _b7, vp[53], _x4 );
			VAL_MSUB_CONJ( _b0, vp[5], _x5 );
			VAL_MSUB_CONJ( _b1, vp[12], _x5 );
			VAL_MSUB_CONJ( _b2, vp[19], _x5 );
			VAL_MSUB_CONJ( _b3, vp[26], _x5 );
			VAL_MSUB_CONJ( _b4, vp[33], _x5 );
			VAL_MSUB_CONJ( _b5, vp[40], _x5 );
			VAL_MSUB_CONJ( _b6, vp[47], _x5 );
			VAL_MSUB_CONJ( _b7, vp[54], _x5 );
			VAL_MSUB_CONJ( _b0, vp[6], _x6 );
			VAL_MSUB_CONJ( _b1, vp[13], _x6 );
			VAL_MSUB_CONJ( _b2, vp[20], _x6 );
			VAL_MSUB_CONJ( _b3, vp[27], _x6 );
			VAL_MSUB_CONJ( _b4, vp[34], _x6 );
			VAL_MSUB_CONJ( _b5, vp[41], _x6 );
			VAL_MSUB_CONJ( _b6, vp[48], _x6 );
			VAL_MSUB_CONJ( _b7, vp[55], _x6 );
		} /* K */

		/*
		 * Eliminate the diagonal block from row 7 up to row 0:
		 * divide by the diagonal entry, then remove that unknown
		 * from every row above it (entries dp[r*8+c] with r < c).
		 */
		VAL_DIVEQ_CONJ( _b7, dp[7*8+7] );
		VAL_MSUB_CONJ( _b0, dp[0*8+7], _b7 );
		VAL_MSUB_CONJ( _b1, dp[1*8+7], _b7 );
		VAL_MSUB_CONJ( _b2, dp[2*8+7], _b7 );
		VAL_MSUB_CONJ( _b3, dp[3*8+7], _b7 );
		VAL_MSUB_CONJ( _b4, dp[4*8+7], _b7 );
		VAL_MSUB_CONJ( _b5, dp[5*8+7], _b7 );
		VAL_MSUB_CONJ( _b6, dp[6*8+7], _b7 );
		VAL_DIVEQ_CONJ( _b6, dp[6*8+6] );
		VAL_MSUB_CONJ( _b0, dp[0*8+6], _b6 );
		VAL_MSUB_CONJ( _b1, dp[1*8+6], _b6 );
		VAL_MSUB_CONJ( _b2, dp[2*8+6], _b6 );
		VAL_MSUB_CONJ( _b3, dp[3*8+6], _b6 );
		VAL_MSUB_CONJ( _b4, dp[4*8+6], _b6 );
		VAL_MSUB_CONJ( _b5, dp[5*8+6], _b6 );
		VAL_DIVEQ_CONJ( _b5, dp[5*8+5] );
		VAL_MSUB_CONJ( _b0, dp[0*8+5], _b5 );
		VAL_MSUB_CONJ( _b1, dp[1*8+5], _b5 );
		VAL_MSUB_CONJ( _b2, dp[2*8+5], _b5 );
		VAL_MSUB_CONJ( _b3, dp[3*8+5], _b5 );
		VAL_MSUB_CONJ( _b4, dp[4*8+5], _b5 );
		VAL_DIVEQ_CONJ( _b4, dp[4*8+4] );
		VAL_MSUB_CONJ( _b0, dp[0*8+4], _b4 );
		VAL_MSUB_CONJ( _b1, dp[1*8+4], _b4 );
		VAL_MSUB_CONJ( _b2, dp[2*8+4], _b4 );
		VAL_MSUB_CONJ( _b3, dp[3*8+4], _b4 );
		VAL_DIVEQ_CONJ( _b3, dp[3*8+3] );
		VAL_MSUB_CONJ( _b0, dp[0*8+3], _b3 );
		VAL_MSUB_CONJ( _b1, dp[1*8+3], _b3 );
		VAL_MSUB_CONJ( _b2, dp[2*8+3], _b3 );
		VAL_DIVEQ_CONJ( _b2, dp[2*8+2] );
		VAL_MSUB_CONJ( _b0, dp[0*8+2], _b2 );
		VAL_MSUB_CONJ( _b1, dp[1*8+2], _b2 );
		VAL_DIVEQ_CONJ( _b1, dp[1*8+1] );
		VAL_MSUB_CONJ( _b0, dp[0*8+1], _b1 );
		VAL_DIVEQ_CONJ( _b0, dp[0*8+0] );
		/* Store the solved entries back into x. */
		VAL_ASSIGN( bp[0], _b0 );
		VAL_ASSIGN( bp[1], _b1 );
		VAL_ASSIGN( bp[2], _b2 );
		VAL_ASSIGN( bp[3], _b3 );
		VAL_ASSIGN( bp[4], _b4 );
		VAL_ASSIGN( bp[5], _b5 );
		VAL_ASSIGN( bp[6], _b6 );
		VAL_ASSIGN( bp[7], _b7 );
	} /* I */
} /* MBCSR_MatConjTrisolve_Upper_v1_aX_xs1 */

#endif /* IS_VAL_COMPLEX */
#if !IS_VAL_COMPLEX
	/** Synonym for pure real version */
	#define MBCSR_MatConjTrisolve_Upper_v1_aX_xsX MBCSR_MatTrisolve_Upper_v1_aX_xsX
#else /* IS_VAL_COMPLEX */

#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatConjTrisolve_Upper_v1_aX_xsX. */
	#define MBCSR_MatConjTrisolve_Upper_v1_aX_xsX \
		MANGLE_MOD_(MBCSR_MatConjTrisolve_Upper_v1_aX_xsX_8x7)
#endif

#endif
#if IS_VAL_COMPLEX
/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$\bar{U}^{-1}\cdot b\f$, where x has general-stride.
 *
 *  Same sequence of operations as the unit-stride variant, except
 *  that logical element k of x is accessed at x[k*incx]. Block
 *  rows are processed from the last (I == M) to the first
 *  (I == 1): the 8 entries of x are scaled by \p alpha, the
 *  off-diagonal block contributions are subtracted, the 8x8
 *  diagonal block is eliminated from its last row upward, and the
 *  results are stored back into x.
 *
 *  The arithmetic is done through the VAL_* macros, which are
 *  defined outside this file; the *_CONJ variants presumably apply
 *  the complex conjugate of the matrix entry (confirm against
 *  their definitions).
 *
 *  \param[in] M      Number of block rows to process.
 *  \param[in] d0     Logical offset in x of the first row of block row 1.
 *  \param[in] ptr    Block-row pointers: blocks of block row I are
 *                    ptr[I-1] .. ptr[I]-1.
 *  \param[in] ind    Per-block logical offset in x of the block's
 *                    first column.
 *  \param[in] val    Off-diagonal blocks, 8*7 values each, indexed
 *                    as row*7 + column.
 *  \param[in] diag   Diagonal blocks, 8*8 values each, indexed as
 *                    row*8 + column.
 *  \param[in] alpha  Scalar applied to the entries of x as they are loaded.
 *  \param[in,out] x  Strided vector, overwritten in place.
 *  \param[in] incx   Distance between consecutive logical elements of x.
 */
void
MBCSR_MatConjTrisolve_Upper_v1_aX_xsX(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x, oski_index_t incx )
{
	oski_index_t I;
	/* Start at the last block row and the last diagonal block. */
	oski_value_t* bp = x + (d0 + (M-1)*8)*incx;
	const oski_value_t* dp = diag + (M-1)*8*8;

	for( I = M; I != 0; I--, bp -= 8*incx, dp -= 8*8 )
	{
		oski_index_t K;
		REGISTER oski_value_t _b0;
		REGISTER oski_value_t _b1;
		REGISTER oski_value_t _b2;
		REGISTER oski_value_t _b3;
		REGISTER oski_value_t _b4;
		REGISTER oski_value_t _b5;
		REGISTER oski_value_t _b6;
		REGISTER oski_value_t _b7;
		/* First off-diagonal block of this block row. */
		const oski_value_t* vp = val + ptr[I-1]*8*7;

		/* Load the 8 entries of this block row, scaled by alpha. */
		VAL_MUL( _b0, alpha, bp[0] );
		VAL_MUL( _b1, alpha, bp[1*incx] );
		VAL_MUL( _b2, alpha, bp[2*incx] );
		VAL_MUL( _b3, alpha, bp[3*incx] );
		VAL_MUL( _b4, alpha, bp[4*incx] );
		VAL_MUL( _b5, alpha, bp[5*incx] );
		VAL_MUL( _b6, alpha, bp[6*incx] );
		VAL_MUL( _b7, alpha, bp[7*incx] );

		/* Subtract the contribution of every off-diagonal block. */
		for( K = ptr[I-1]; K < ptr[I]; K++, vp += 8*7 )
		{
			oski_index_t j0 = ind[K];
			const oski_value_t* xp = x + j0*incx;
			REGISTER oski_value_t _x0;
			REGISTER oski_value_t _x1;
			REGISTER oski_value_t _x2;
			REGISTER oski_value_t _x3;
			REGISTER oski_value_t _x4;
			REGISTER oski_value_t _x5;
			REGISTER oski_value_t _x6;

			/* The 7 entries of x matching this block's columns. */
			VAL_ASSIGN( _x0, xp[0] );
			VAL_ASSIGN( _x1, xp[1*incx] );
			VAL_ASSIGN( _x2, xp[2*incx] );
			VAL_ASSIGN( _x3, xp[3*incx] );
			VAL_ASSIGN( _x4, xp[4*incx] );
			VAL_ASSIGN( _x5, xp[5*incx] );
			VAL_ASSIGN( _x6, xp[6*incx] );

			/* Column-by-column update: _b<r> uses vp[r*7+c] and _x<c>. */
			VAL_MSUB_CONJ( _b0, vp[0], _x0 );
			VAL_MSUB_CONJ( _b1, vp[7], _x0 );
			VAL_MSUB_CONJ( _b2, vp[14], _x0 );
			VAL_MSUB_CONJ( _b3, vp[21], _x0 );
			VAL_MSUB_CONJ( _b4, vp[28], _x0 );
			VAL_MSUB_CONJ( _b5, vp[35], _x0 );
			VAL_MSUB_CONJ( _b6, vp[42], _x0 );
			VAL_MSUB_CONJ( _b7, vp[49], _x0 );
			VAL_MSUB_CONJ( _b0, vp[1], _x1 );
			VAL_MSUB_CONJ( _b1, vp[8], _x1 );
			VAL_MSUB_CONJ( _b2, vp[15], _x1 );
			VAL_MSUB_CONJ( _b3, vp[22], _x1 );
			VAL_MSUB_CONJ( _b4, vp[29], _x1 );
			VAL_MSUB_CONJ( _b5, vp[36], _x1 );
			VAL_MSUB_CONJ( _b6, vp[43], _x1 );
			VAL_MSUB_CONJ( _b7, vp[50], _x1 );
			VAL_MSUB_CONJ( _b0, vp[2], _x2 );
			VAL_MSUB_CONJ( _b1, vp[9], _x2 );
			VAL_MSUB_CONJ( _b2, vp[16], _x2 );
			VAL_MSUB_CONJ( _b3, vp[23], _x2 );
			VAL_MSUB_CONJ( _b4, vp[30], _x2 );
			VAL_MSUB_CONJ( _b5, vp[37], _x2 );
			VAL_MSUB_CONJ( _b6, vp[44], _x2 );
			VAL_MSUB_CONJ( _b7, vp[51], _x2 );
			VAL_MSUB_CONJ( _b0, vp[3], _x3 );
			VAL_MSUB_CONJ( _b1, vp[10], _x3 );
			VAL_MSUB_CONJ( _b2, vp[17], _x3 );
			VAL_MSUB_CONJ( _b3, vp[24], _x3 );
			VAL_MSUB_CONJ( _b4, vp[31], _x3 );
			VAL_MSUB_CONJ( _b5, vp[38], _x3 );
			VAL_MSUB_CONJ( _b6, vp[45], _x3 );
			VAL_MSUB_CONJ( _b7, vp[52], _x3 );
			VAL_MSUB_CONJ( _b0, vp[4], _x4 );
			VAL_MSUB_CONJ( _b1, vp[11], _x4 );
			VAL_MSUB_CONJ( _b2, vp[18], _x4 );
			VAL_MSUB_CONJ( _b3, vp[25], _x4 );
			VAL_MSUB_CONJ( _b4, vp[32], _x4 );
			VAL_MSUB_CONJ( _b5, vp[39], _x4 );
			VAL_MSUB_CONJ( _b6, vp[46], _x4 );
			VAL_MSUB_CONJ( _b7, vp[53], _x4 );
			VAL_MSUB_CONJ( _b0, vp[5], _x5 );
			VAL_MSUB_CONJ( _b1, vp[12], _x5 );
			VAL_MSUB_CONJ( _b2, vp[19], _x5 );
			VAL_MSUB_CONJ( _b3, vp[26], _x5 );
			VAL_MSUB_CONJ( _b4, vp[33], _x5 );
			VAL_MSUB_CONJ( _b5, vp[40], _x5 );
			VAL_MSUB_CONJ( _b6, vp[47], _x5 );
			VAL_MSUB_CONJ( _b7, vp[54], _x5 );
			VAL_MSUB_CONJ( _b0, vp[6], _x6 );
			VAL_MSUB_CONJ( _b1, vp[13], _x6 );
			VAL_MSUB_CONJ( _b2, vp[20], _x6 );
			VAL_MSUB_CONJ( _b3, vp[27], _x6 );
			VAL_MSUB_CONJ( _b4, vp[34], _x6 );
			VAL_MSUB_CONJ( _b5, vp[41], _x6 );
			VAL_MSUB_CONJ( _b6, vp[48], _x6 );
			VAL_MSUB_CONJ( _b7, vp[55], _x6 );
		} /* K */

		/*
		 * Eliminate the diagonal block from row 7 up to row 0:
		 * divide by the diagonal entry, then remove that unknown
		 * from every row above it (entries dp[r*8+c] with r < c).
		 */
		VAL_DIVEQ_CONJ( _b7, dp[7*8+7] );
		VAL_MSUB_CONJ( _b0, dp[0*8+7], _b7 );
		VAL_MSUB_CONJ( _b1, dp[1*8+7], _b7 );
		VAL_MSUB_CONJ( _b2, dp[2*8+7], _b7 );
		VAL_MSUB_CONJ( _b3, dp[3*8+7], _b7 );
		VAL_MSUB_CONJ( _b4, dp[4*8+7], _b7 );
		VAL_MSUB_CONJ( _b5, dp[5*8+7], _b7 );
		VAL_MSUB_CONJ( _b6, dp[6*8+7], _b7 );
		VAL_DIVEQ_CONJ( _b6, dp[6*8+6] );
		VAL_MSUB_CONJ( _b0, dp[0*8+6], _b6 );
		VAL_MSUB_CONJ( _b1, dp[1*8+6], _b6 );
		VAL_MSUB_CONJ( _b2, dp[2*8+6], _b6 );
		VAL_MSUB_CONJ( _b3, dp[3*8+6], _b6 );
		VAL_MSUB_CONJ( _b4, dp[4*8+6], _b6 );
		VAL_MSUB_CONJ( _b5, dp[5*8+6], _b6 );
		VAL_DIVEQ_CONJ( _b5, dp[5*8+5] );
		VAL_MSUB_CONJ( _b0, dp[0*8+5], _b5 );
		VAL_MSUB_CONJ( _b1, dp[1*8+5], _b5 );
		VAL_MSUB_CONJ( _b2, dp[2*8+5], _b5 );
		VAL_MSUB_CONJ( _b3, dp[3*8+5], _b5 );
		VAL_MSUB_CONJ( _b4, dp[4*8+5], _b5 );
		VAL_DIVEQ_CONJ( _b4, dp[4*8+4] );
		VAL_MSUB_CONJ( _b0, dp[0*8+4], _b4 );
		VAL_MSUB_CONJ( _b1, dp[1*8+4], _b4 );
		VAL_MSUB_CONJ( _b2, dp[2*8+4], _b4 );
		VAL_MSUB_CONJ( _b3, dp[3*8+4], _b4 );
		VAL_DIVEQ_CONJ( _b3, dp[3*8+3] );
		VAL_MSUB_CONJ( _b0, dp[0*8+3], _b3 );
		VAL_MSUB_CONJ( _b1, dp[1*8+3], _b3 );
		VAL_MSUB_CONJ( _b2, dp[2*8+3], _b3 );
		VAL_DIVEQ_CONJ( _b2, dp[2*8+2] );
		VAL_MSUB_CONJ( _b0, dp[0*8+2], _b2 );
		VAL_MSUB_CONJ( _b1, dp[1*8+2], _b2 );
		VAL_DIVEQ_CONJ( _b1, dp[1*8+1] );
		VAL_MSUB_CONJ( _b0, dp[0*8+1], _b1 );
		VAL_DIVEQ_CONJ( _b0, dp[0*8+0] );
		/* Store the solved entries back into x. */
		VAL_ASSIGN( bp[0], _b0 );
		VAL_ASSIGN( bp[1*incx], _b1 );
		VAL_ASSIGN( bp[2*incx], _b2 );
		VAL_ASSIGN( bp[3*incx], _b3 );
		VAL_ASSIGN( bp[4*incx], _b4 );
		VAL_ASSIGN( bp[5*incx], _b5 );
		VAL_ASSIGN( bp[6*incx], _b6 );
		VAL_ASSIGN( bp[7*incx], _b7 );
	} /* I */
} /* MBCSR_MatConjTrisolve_Upper_v1_aX_xsX */

#endif /* IS_VAL_COMPLEX */
/**
 *  \brief Column-by-column driver for the \f$8\times 7\f$
 *  conjugated upper-triangular solve kernels.
 *
 *  Walks the columns of the vector view \p x (advancing by
 *  \c x->colinc each time) and hands every column to the
 *  unit-stride kernel when \c x->rowinc is 1, or to the
 *  general-stride kernel otherwise. For real-valued builds the
 *  kernel names are synonyms for the non-conjugated kernels.
 *
 *  \param[in] T      Submatrix; must have 8x7 blocks (asserted).
 *  \param[in] alpha  Scalar forwarded unchanged to the kernels.
 *  \param[in,out] x  Vector view whose columns are passed to the kernels.
 */
static void
MatConjTrisolve_Upper( const oski_submatMBCSR_t* T,
	oski_value_t alpha, oski_vecview_t x )
{
	oski_index_t col = 0;
	oski_value_t* col_ptr;

	assert( T != NULL );
	assert( T->r == 8 );
	assert( T->c == 7 );
	assert( x != INVALID_VEC );

	col_ptr = x->val;
	if( x->rowinc != 1 ) {
		/* Rows of x are separated by a general (non-unit) stride. */
		while( col < x->num_cols ) {
			MBCSR_MatConjTrisolve_Upper_v1_aX_xsX( T->num_block_rows, T->offset,
				T->bptr, T->bind, T->bval, T->bdiag, alpha, col_ptr, x->rowinc );
			col++;
			col_ptr += x->colinc;
		}
	} else {
		/* Rows of x are contiguous. */
		while( col < x->num_cols ) {
			MBCSR_MatConjTrisolve_Upper_v1_aX_xs1( T->num_block_rows, T->offset,
				T->bptr, T->bind, T->bval, T->bdiag, alpha, col_ptr );
			col++;
			col_ptr += x->colinc;
		}
	}
}

#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatTransTrisolve_Upper_v1_aX_xs1. */
	#define MBCSR_MatTransTrisolve_Upper_v1_aX_xs1 \
		MANGLE_MOD_(MBCSR_MatTransTrisolve_Upper_v1_aX_xs1_8x7)
#endif

/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$U^{-T}\cdot b\f$, where x has unit-stride.
 *
 *  The first M*8 entries of x are scaled by \p alpha up front via
 *  BLAS_xSCAL. Block rows are then processed from the first
 *  (I == 1) to the last (I == M): the 8 entries of x for the block
 *  row are solved against the diagonal block using its entries
 *  dp[r*8+c] with r <= c, and the solved values are then used to
 *  update the entries of x addressed by each off-diagonal block.
 *
 *  NOTE(review): the scaling starts at x[0] while the block rows
 *  start at x[d0]; confirm that callers either pass d0 == 0 or do
 *  not depend on entries at or beyond x[M*8] being scaled here.
 *
 *  \param[in] M      Number of block rows to process.
 *  \param[in] d0     Offset into x of the first row of block row 1.
 *  \param[in] ptr    Block-row pointers: blocks of block row I are
 *                    ptr[I-1] .. ptr[I]-1.
 *  \param[in] ind    Per-block offset into x of the block's first column.
 *  \param[in] val    Off-diagonal blocks, 8*7 values each, indexed
 *                    as row*7 + column.
 *  \param[in] diag   Diagonal blocks, 8*8 values each, indexed as
 *                    row*8 + column.
 *  \param[in] alpha  Scalar applied to x before the solve.
 *  \param[in,out] x  Unit-stride vector, overwritten in place.
 */
void
MBCSR_MatTransTrisolve_Upper_v1_aX_xs1(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x )
{
	oski_index_t I;
	oski_value_t* bp = x + d0;
	const oski_value_t* dp = diag;

	/* Scale M*8 entries of x by alpha before any elimination. */
	{
		oski_index_t m = M * 8;
		oski_index_t incx = 1;
		BLAS_xSCAL( &m, &alpha, x, &incx );
	}

	for( I = 1; I != M+1; I++, bp += 8, dp += 8*8 )
	{
		oski_index_t K;
		REGISTER oski_value_t _b0;
		REGISTER oski_value_t _b1;
		REGISTER oski_value_t _b2;
		REGISTER oski_value_t _b3;
		REGISTER oski_value_t _b4;
		REGISTER oski_value_t _b5;
		REGISTER oski_value_t _b6;
		REGISTER oski_value_t _b7;
		/* First off-diagonal block of this block row. */
		const oski_value_t* vp = val + ptr[I-1]*8*7;

		/* Load the (already scaled) 8 entries of this block row. */
		VAL_ASSIGN( _b0, bp[0] );
		VAL_ASSIGN( _b1, bp[1] );
		VAL_ASSIGN( _b2, bp[2] );
		VAL_ASSIGN( _b3, bp[3] );
		VAL_ASSIGN( _b4, bp[4] );
		VAL_ASSIGN( _b5, bp[5] );
		VAL_ASSIGN( _b6, bp[6] );
		VAL_ASSIGN( _b7, bp[7] );

		/*
		 * Solve against the diagonal block from unknown 0 to 7:
		 * divide by the diagonal entry, then remove that unknown
		 * from the later ones using row r of the block
		 * (entries dp[r*8+c] with c > r).
		 */
		VAL_DIVEQ( _b0, dp[0*8+0] );
		VAL_MSUB( _b1, dp[0*8+1], _b0 );
		VAL_MSUB( _b2, dp[0*8+2], _b0 );
		VAL_MSUB( _b3, dp[0*8+3], _b0 );
		VAL_MSUB( _b4, dp[0*8+4], _b0 );
		VAL_MSUB( _b5, dp[0*8+5], _b0 );
		VAL_MSUB( _b6, dp[0*8+6], _b0 );
		VAL_MSUB( _b7, dp[0*8+7], _b0 );
		VAL_DIVEQ( _b1, dp[1*8+1] );
		VAL_MSUB( _b2, dp[1*8+2], _b1 );
		VAL_MSUB( _b3, dp[1*8+3], _b1 );
		VAL_MSUB( _b4, dp[1*8+4], _b1 );
		VAL_MSUB( _b5, dp[1*8+5], _b1 );
		VAL_MSUB( _b6, dp[1*8+6], _b1 );
		VAL_MSUB( _b7, dp[1*8+7], _b1 );
		VAL_DIVEQ( _b2, dp[2*8+2] );
		VAL_MSUB( _b3, dp[2*8+3], _b2 );
		VAL_MSUB( _b4, dp[2*8+4], _b2 );
		VAL_MSUB( _b5, dp[2*8+5], _b2 );
		VAL_MSUB( _b6, dp[2*8+6], _b2 );
		VAL_MSUB( _b7, dp[2*8+7], _b2 );
		VAL_DIVEQ( _b3, dp[3*8+3] );
		VAL_MSUB( _b4, dp[3*8+4], _b3 );
		VAL_MSUB( _b5, dp[3*8+5], _b3 );
		VAL_MSUB( _b6, dp[3*8+6], _b3 );
		VAL_MSUB( _b7, dp[3*8+7], _b3 );
		VAL_DIVEQ( _b4, dp[4*8+4] );
		VAL_MSUB( _b5, dp[4*8+5], _b4 );
		VAL_MSUB( _b6, dp[4*8+6], _b4 );
		VAL_MSUB( _b7, dp[4*8+7], _b4 );
		VAL_DIVEQ( _b5, dp[5*8+5] );
		VAL_MSUB( _b6, dp[5*8+6], _b5 );
		VAL_MSUB( _b7, dp[5*8+7], _b5 );
		VAL_DIVEQ( _b6, dp[6*8+6] );
		VAL_MSUB( _b7, dp[6*8+7], _b6 );
		VAL_DIVEQ( _b7, dp[7*8+7] );

		/* Push the solved values into the entries of x that each
		 * off-diagonal block addresses. */
		for( K = ptr[I-1]; K < ptr[I]; K++, vp += 8*7 )
		{
			oski_index_t j0 = ind[K];
			oski_value_t* xp = x + j0;
			REGISTER oski_value_t _x0;
			REGISTER oski_value_t _x1;
			REGISTER oski_value_t _x2;
			REGISTER oski_value_t _x3;
			REGISTER oski_value_t _x4;
			REGISTER oski_value_t _x5;
			REGISTER oski_value_t _x6;

			/* Current values of the 7 entries to be updated. */
			VAL_ASSIGN( _x0, xp[0] );
			VAL_ASSIGN( _x1, xp[1] );
			VAL_ASSIGN( _x2, xp[2] );
			VAL_ASSIGN( _x3, xp[3] );
			VAL_ASSIGN( _x4, xp[4] );
			VAL_ASSIGN( _x5, xp[5] );
			VAL_ASSIGN( _x6, xp[6] );

			/* Row-by-row update: _x<c> uses vp[r*7+c] and _b<r>. */
			VAL_MSUB( _x0, vp[0], _b0 );
			VAL_MSUB( _x1, vp[1], _b0 );
			VAL_MSUB( _x2, vp[2], _b0 );
			VAL_MSUB( _x3, vp[3], _b0 );
			VAL_MSUB( _x4, vp[4], _b0 );
			VAL_MSUB( _x5, vp[5], _b0 );
			VAL_MSUB( _x6, vp[6], _b0 );
			VAL_MSUB( _x0, vp[7], _b1 );
			VAL_MSUB( _x1, vp[8], _b1 );
			VAL_MSUB( _x2, vp[9], _b1 );
			VAL_MSUB( _x3, vp[10], _b1 );
			VAL_MSUB( _x4, vp[11], _b1 );
			VAL_MSUB( _x5, vp[12], _b1 );
			VAL_MSUB( _x6, vp[13], _b1 );
			VAL_MSUB( _x0, vp[14], _b2 );
			VAL_MSUB( _x1, vp[15], _b2 );
			VAL_MSUB( _x2, vp[16], _b2 );
			VAL_MSUB( _x3, vp[17], _b2 );
			VAL_MSUB( _x4, vp[18], _b2 );
			VAL_MSUB( _x5, vp[19], _b2 );
			VAL_MSUB( _x6, vp[20], _b2 );
			VAL_MSUB( _x0, vp[21], _b3 );
			VAL_MSUB( _x1, vp[22], _b3 );
			VAL_MSUB( _x2, vp[23], _b3 );
			VAL_MSUB( _x3, vp[24], _b3 );
			VAL_MSUB( _x4, vp[25], _b3 );
			VAL_MSUB( _x5, vp[26], _b3 );
			VAL_MSUB( _x6, vp[27], _b3 );
			VAL_MSUB( _x0, vp[28], _b4 );
			VAL_MSUB( _x1, vp[29], _b4 );
			VAL_MSUB( _x2, vp[30], _b4 );
			VAL_MSUB( _x3, vp[31], _b4 );
			VAL_MSUB( _x4, vp[32], _b4 );
			VAL_MSUB( _x5, vp[33], _b4 );
			VAL_MSUB( _x6, vp[34], _b4 );
			VAL_MSUB( _x0, vp[35], _b5 );
			VAL_MSUB( _x1, vp[36], _b5 );
			VAL_MSUB( _x2, vp[37], _b5 );
			VAL_MSUB( _x3, vp[38], _b5 );
			VAL_MSUB( _x4, vp[39], _b5 );
			VAL_MSUB( _x5, vp[40], _b5 );
			VAL_MSUB( _x6, vp[41], _b5 );
			VAL_MSUB( _x0, vp[42], _b6 );
			VAL_MSUB( _x1, vp[43], _b6 );
			VAL_MSUB( _x2, vp[44], _b6 );
			VAL_MSUB( _x3, vp[45], _b6 );
			VAL_MSUB( _x4, vp[46], _b6 );
			VAL_MSUB( _x5, vp[47], _b6 );
			VAL_MSUB( _x6, vp[48], _b6 );
			VAL_MSUB( _x0, vp[49], _b7 );
			VAL_MSUB( _x1, vp[50], _b7 );
			VAL_MSUB( _x2, vp[51], _b7 );
			VAL_MSUB( _x3, vp[52], _b7 );
			VAL_MSUB( _x4, vp[53], _b7 );
			VAL_MSUB( _x5, vp[54], _b7 );
			VAL_MSUB( _x6, vp[55], _b7 );
			/* Write the updated entries back to x. */
			VAL_ASSIGN( xp[0], _x0 );
			VAL_ASSIGN( xp[1], _x1 );
			VAL_ASSIGN( xp[2], _x2 );
			VAL_ASSIGN( xp[3], _x3 );
			VAL_ASSIGN( xp[4], _x4 );
			VAL_ASSIGN( xp[5], _x5 );
			VAL_ASSIGN( xp[6], _x6 );
		} /* K */

		/* Store the solved entries of this block row. */
		VAL_ASSIGN( bp[0], _b0 );
		VAL_ASSIGN( bp[1], _b1 );
		VAL_ASSIGN( bp[2], _b2 );
		VAL_ASSIGN( bp[3], _b3 );
		VAL_ASSIGN( bp[4], _b4 );
		VAL_ASSIGN( bp[5], _b5 );
		VAL_ASSIGN( bp[6], _b6 );
		VAL_ASSIGN( bp[7], _b7 );
	} /* I */
} /* MBCSR_MatTransTrisolve_Upper_v1_aX_xs1 */

#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatTransTrisolve_Upper_v1_aX_xsX. */
	#define MBCSR_MatTransTrisolve_Upper_v1_aX_xsX \
		MANGLE_MOD_(MBCSR_MatTransTrisolve_Upper_v1_aX_xsX_8x7)
#endif

/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$U^{-T}\cdot b\f$, where x has general-stride.
 *
 *  Same sequence of operations as the unit-stride variant, except
 *  that logical element k of x is accessed at x[k*incx]. M*8
 *  entries of x (starting at x[0], stride incx) are scaled by
 *  \p alpha up front via BLAS_xSCAL. Block rows are then processed
 *  from the first (I == 1) to the last (I == M): the 8 entries of
 *  the block row are solved against the diagonal block using its
 *  entries dp[r*8+c] with r <= c, and the solved values are then
 *  used to update the entries of x addressed by each off-diagonal
 *  block.
 *
 *  NOTE(review): the scaling starts at x[0] while the block rows
 *  start at logical offset d0; confirm that callers either pass
 *  d0 == 0 or do not depend on later entries being scaled here.
 *
 *  \param[in] M      Number of block rows to process.
 *  \param[in] d0     Logical offset in x of the first row of block row 1.
 *  \param[in] ptr    Block-row pointers: blocks of block row I are
 *                    ptr[I-1] .. ptr[I]-1.
 *  \param[in] ind    Per-block logical offset in x of the block's
 *                    first column.
 *  \param[in] val    Off-diagonal blocks, 8*7 values each, indexed
 *                    as row*7 + column.
 *  \param[in] diag   Diagonal blocks, 8*8 values each, indexed as
 *                    row*8 + column.
 *  \param[in] alpha  Scalar applied to x before the solve.
 *  \param[in,out] x  Strided vector, overwritten in place.
 *  \param[in] incx   Distance between consecutive logical elements of x.
 */
void
MBCSR_MatTransTrisolve_Upper_v1_aX_xsX(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x, oski_index_t incx )
{
	oski_index_t I;
	oski_value_t* bp = x + d0*incx;
	const oski_value_t* dp = diag;

	/* Scale M*8 strided entries of x by alpha before any elimination. */
	{
		oski_index_t m = M * 8;
		BLAS_xSCAL( &m, &alpha, x, &incx );
	}

	for( I = 1; I != M+1; I++, bp += 8*incx, dp += 8*8 )
	{
		oski_index_t K;
		REGISTER oski_value_t _b0;
		REGISTER oski_value_t _b1;
		REGISTER oski_value_t _b2;
		REGISTER oski_value_t _b3;
		REGISTER oski_value_t _b4;
		REGISTER oski_value_t _b5;
		REGISTER oski_value_t _b6;
		REGISTER oski_value_t _b7;
		/* First off-diagonal block of this block row. */
		const oski_value_t* vp = val + ptr[I-1]*8*7;

		/* Load the (already scaled) 8 entries of this block row. */
		VAL_ASSIGN( _b0, bp[0] );
		VAL_ASSIGN( _b1, bp[1*incx] );
		VAL_ASSIGN( _b2, bp[2*incx] );
		VAL_ASSIGN( _b3, bp[3*incx] );
		VAL_ASSIGN( _b4, bp[4*incx] );
		VAL_ASSIGN( _b5, bp[5*incx] );
		VAL_ASSIGN( _b6, bp[6*incx] );
		VAL_ASSIGN( _b7, bp[7*incx] );

		/*
		 * Solve against the diagonal block from unknown 0 to 7:
		 * divide by the diagonal entry, then remove that unknown
		 * from the later ones using row r of the block
		 * (entries dp[r*8+c] with c > r).
		 */
		VAL_DIVEQ( _b0, dp[0*8+0] );
		VAL_MSUB( _b1, dp[0*8+1], _b0 );
		VAL_MSUB( _b2, dp[0*8+2], _b0 );
		VAL_MSUB( _b3, dp[0*8+3], _b0 );
		VAL_MSUB( _b4, dp[0*8+4], _b0 );
		VAL_MSUB( _b5, dp[0*8+5], _b0 );
		VAL_MSUB( _b6, dp[0*8+6], _b0 );
		VAL_MSUB( _b7, dp[0*8+7], _b0 );
		VAL_DIVEQ( _b1, dp[1*8+1] );
		VAL_MSUB( _b2, dp[1*8+2], _b1 );
		VAL_MSUB( _b3, dp[1*8+3], _b1 );
		VAL_MSUB( _b4, dp[1*8+4], _b1 );
		VAL_MSUB( _b5, dp[1*8+5], _b1 );
		VAL_MSUB( _b6, dp[1*8+6], _b1 );
		VAL_MSUB( _b7, dp[1*8+7], _b1 );
		VAL_DIVEQ( _b2, dp[2*8+2] );
		VAL_MSUB( _b3, dp[2*8+3], _b2 );
		VAL_MSUB( _b4, dp[2*8+4], _b2 );
		VAL_MSUB( _b5, dp[2*8+5], _b2 );
		VAL_MSUB( _b6, dp[2*8+6], _b2 );
		VAL_MSUB( _b7, dp[2*8+7], _b2 );
		VAL_DIVEQ( _b3, dp[3*8+3] );
		VAL_MSUB( _b4, dp[3*8+4], _b3 );
		VAL_MSUB( _b5, dp[3*8+5], _b3 );
		VAL_MSUB( _b6, dp[3*8+6], _b3 );
		VAL_MSUB( _b7, dp[3*8+7], _b3 );
		VAL_DIVEQ( _b4, dp[4*8+4] );
		VAL_MSUB( _b5, dp[4*8+5], _b4 );
		VAL_MSUB( _b6, dp[4*8+6], _b4 );
		VAL_MSUB( _b7, dp[4*8+7], _b4 );
		VAL_DIVEQ( _b5, dp[5*8+5] );
		VAL_MSUB( _b6, dp[5*8+6], _b5 );
		VAL_MSUB( _b7, dp[5*8+7], _b5 );
		VAL_DIVEQ( _b6, dp[6*8+6] );
		VAL_MSUB( _b7, dp[6*8+7], _b6 );
		VAL_DIVEQ( _b7, dp[7*8+7] );

		/* Push the solved values into the entries of x that each
		 * off-diagonal block addresses. */
		for( K = ptr[I-1]; K < ptr[I]; K++, vp += 8*7 )
		{
			oski_index_t j0 = ind[K];
			oski_value_t* xp = x + j0*incx;
			REGISTER oski_value_t _x0;
			REGISTER oski_value_t _x1;
			REGISTER oski_value_t _x2;
			REGISTER oski_value_t _x3;
			REGISTER oski_value_t _x4;
			REGISTER oski_value_t _x5;
			REGISTER oski_value_t _x6;

			/* Current values of the 7 entries to be updated. */
			VAL_ASSIGN( _x0, xp[0] );
			VAL_ASSIGN( _x1, xp[1*incx] );
			VAL_ASSIGN( _x2, xp[2*incx] );
			VAL_ASSIGN( _x3, xp[3*incx] );
			VAL_ASSIGN( _x4, xp[4*incx] );
			VAL_ASSIGN( _x5, xp[5*incx] );
			VAL_ASSIGN( _x6, xp[6*incx] );

			/* Row-by-row update: _x<c> uses vp[r*7+c] and _b<r>. */
			VAL_MSUB( _x0, vp[0], _b0 );
			VAL_MSUB( _x1, vp[1], _b0 );
			VAL_MSUB( _x2, vp[2], _b0 );
			VAL_MSUB( _x3, vp[3], _b0 );
			VAL_MSUB( _x4, vp[4], _b0 );
			VAL_MSUB( _x5, vp[5], _b0 );
			VAL_MSUB( _x6, vp[6], _b0 );
			VAL_MSUB( _x0, vp[7], _b1 );
			VAL_MSUB( _x1, vp[8], _b1 );
			VAL_MSUB( _x2, vp[9], _b1 );
			VAL_MSUB( _x3, vp[10], _b1 );
			VAL_MSUB( _x4, vp[11], _b1 );
			VAL_MSUB( _x5, vp[12], _b1 );
			VAL_MSUB( _x6, vp[13], _b1 );
			VAL_MSUB( _x0, vp[14], _b2 );
			VAL_MSUB( _x1, vp[15], _b2 );
			VAL_MSUB( _x2, vp[16], _b2 );
			VAL_MSUB( _x3, vp[17], _b2 );
			VAL_MSUB( _x4, vp[18], _b2 );
			VAL_MSUB( _x5, vp[19], _b2 );
			VAL_MSUB( _x6, vp[20], _b2 );
			VAL_MSUB( _x0, vp[21], _b3 );
			VAL_MSUB( _x1, vp[22], _b3 );
			VAL_MSUB( _x2, vp[23], _b3 );
			VAL_MSUB( _x3, vp[24], _b3 );
			VAL_MSUB( _x4, vp[25], _b3 );
			VAL_MSUB( _x5, vp[26], _b3 );
			VAL_MSUB( _x6, vp[27], _b3 );
			VAL_MSUB( _x0, vp[28], _b4 );
			VAL_MSUB( _x1, vp[29], _b4 );
			VAL_MSUB( _x2, vp[30], _b4 );
			VAL_MSUB( _x3, vp[31], _b4 );
			VAL_MSUB( _x4, vp[32], _b4 );
			VAL_MSUB( _x5, vp[33], _b4 );
			VAL_MSUB( _x6, vp[34], _b4 );
			VAL_MSUB( _x0, vp[35], _b5 );
			VAL_MSUB( _x1, vp[36], _b5 );
			VAL_MSUB( _x2, vp[37], _b5 );
			VAL_MSUB( _x3, vp[38], _b5 );
			VAL_MSUB( _x4, vp[39], _b5 );
			VAL_MSUB( _x5, vp[40], _b5 );
			VAL_MSUB( _x6, vp[41], _b5 );
			VAL_MSUB( _x0, vp[42], _b6 );
			VAL_MSUB( _x1, vp[43], _b6 );
			VAL_MSUB( _x2, vp[44], _b6 );
			VAL_MSUB( _x3, vp[45], _b6 );
			VAL_MSUB( _x4, vp[46], _b6 );
			VAL_MSUB( _x5, vp[47], _b6 );
			VAL_MSUB( _x6, vp[48], _b6 );
			VAL_MSUB( _x0, vp[49], _b7 );
			VAL_MSUB( _x1, vp[50], _b7 );
			VAL_MSUB( _x2, vp[51], _b7 );
			VAL_MSUB( _x3, vp[52], _b7 );
			VAL_MSUB( _x4, vp[53], _b7 );
			VAL_MSUB( _x5, vp[54], _b7 );
			VAL_MSUB( _x6, vp[55], _b7 );
			/* Write the updated entries back to x. */
			VAL_ASSIGN( xp[0], _x0 );
			VAL_ASSIGN( xp[1*incx], _x1 );
			VAL_ASSIGN( xp[2*incx], _x2 );
			VAL_ASSIGN( xp[3*incx], _x3 );
			VAL_ASSIGN( xp[4*incx], _x4 );
			VAL_ASSIGN( xp[5*incx], _x5 );
			VAL_ASSIGN( xp[6*incx], _x6 );
		} /* K */

		/* Store the solved entries of this block row. */
		VAL_ASSIGN( bp[0], _b0 );
		VAL_ASSIGN( bp[1*incx], _b1 );
		VAL_ASSIGN( bp[2*incx], _b2 );
		VAL_ASSIGN( bp[3*incx], _b3 );
		VAL_ASSIGN( bp[4*incx], _b4 );
		VAL_ASSIGN( bp[5*incx], _b5 );
		VAL_ASSIGN( bp[6*incx], _b6 );
		VAL_ASSIGN( bp[7*incx], _b7 );
	} /* I */
} /* MBCSR_MatTransTrisolve_Upper_v1_aX_xsX */

/**
 *  \brief Column-by-column driver for the \f$8\times 7\f$
 *  transposed upper-triangular solve kernels.
 *
 *  Walks the columns of the vector view \p x (advancing by
 *  \c x->colinc each time) and hands every column to the
 *  unit-stride kernel when \c x->rowinc is 1, or to the
 *  general-stride kernel otherwise.
 *
 *  \param[in] T      Submatrix; must have 8x7 blocks (asserted).
 *  \param[in] alpha  Scalar forwarded unchanged to the kernels.
 *  \param[in,out] x  Vector view whose columns are passed to the kernels.
 */
static void
MatTransTrisolve_Upper( const oski_submatMBCSR_t* T,
	oski_value_t alpha, oski_vecview_t x )
{
	oski_index_t col = 0;
	oski_value_t* col_ptr;

	assert( T != NULL );
	assert( T->r == 8 );
	assert( T->c == 7 );
	assert( x != INVALID_VEC );

	col_ptr = x->val;
	if( x->rowinc != 1 ) {
		/* Rows of x are separated by a general (non-unit) stride. */
		while( col < x->num_cols ) {
			MBCSR_MatTransTrisolve_Upper_v1_aX_xsX( T->num_block_rows, T->offset,
				T->bptr, T->bind, T->bval, T->bdiag, alpha, col_ptr, x->rowinc );
			col++;
			col_ptr += x->colinc;
		}
	} else {
		/* Rows of x are contiguous. */
		while( col < x->num_cols ) {
			MBCSR_MatTransTrisolve_Upper_v1_aX_xs1( T->num_block_rows, T->offset,
				T->bptr, T->bind, T->bval, T->bdiag, alpha, col_ptr );
			col++;
			col_ptr += x->colinc;
		}
	}
}

#if !IS_VAL_COMPLEX
	/** Synonym for pure real version */
	#define MBCSR_MatHermTrisolve_Upper_v1_aX_xs1 MBCSR_MatTransTrisolve_Upper_v1_aX_xs1
#else /* IS_VAL_COMPLEX */

#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatHermTrisolve_Upper_v1_aX_xs1. */
	#define MBCSR_MatHermTrisolve_Upper_v1_aX_xs1 \
		MANGLE_MOD_(MBCSR_MatHermTrisolve_Upper_v1_aX_xs1_8x7)
#endif

#endif
#if IS_VAL_COMPLEX
/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$\bar{U}^{-T}\cdot b\f$, where x has unit-stride.
 *
 *  The first M*8 entries of x are scaled by \p alpha up front via
 *  BLAS_xSCAL. Block rows are then processed from the first
 *  (I == 1) to the last (I == M): the 8 entries of x for the block
 *  row are solved against the diagonal block using its entries
 *  dp[r*8+c] with r <= c, and the solved values are then used to
 *  update the entries of x addressed by each off-diagonal block.
 *
 *  The arithmetic is done through the VAL_* macros, which are
 *  defined outside this file; the *_CONJ variants presumably apply
 *  the complex conjugate of the matrix entry (confirm against
 *  their definitions).
 *
 *  NOTE(review): the scaling starts at x[0] while the block rows
 *  start at x[d0]; confirm that callers either pass d0 == 0 or do
 *  not depend on entries at or beyond x[M*8] being scaled here.
 *
 *  \param[in] M      Number of block rows to process.
 *  \param[in] d0     Offset into x of the first row of block row 1.
 *  \param[in] ptr    Block-row pointers: blocks of block row I are
 *                    ptr[I-1] .. ptr[I]-1.
 *  \param[in] ind    Per-block offset into x of the block's first column.
 *  \param[in] val    Off-diagonal blocks, 8*7 values each, indexed
 *                    as row*7 + column.
 *  \param[in] diag   Diagonal blocks, 8*8 values each, indexed as
 *                    row*8 + column.
 *  \param[in] alpha  Scalar applied to x before the solve.
 *  \param[in,out] x  Unit-stride vector, overwritten in place.
 */
void
MBCSR_MatHermTrisolve_Upper_v1_aX_xs1(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x )
{
	oski_index_t I;
	oski_value_t* bp = x + d0;
	const oski_value_t* dp = diag;

	/* Scale M*8 entries of x by alpha before any elimination. */
	{
		oski_index_t m = M * 8;
		oski_index_t incx = 1;
		BLAS_xSCAL( &m, &alpha, x, &incx );
	}

	for( I = 1; I != M+1; I++, bp += 8, dp += 8*8 )
	{
		oski_index_t K;
		REGISTER oski_value_t _b0;
		REGISTER oski_value_t _b1;
		REGISTER oski_value_t _b2;
		REGISTER oski_value_t _b3;
		REGISTER oski_value_t _b4;
		REGISTER oski_value_t _b5;
		REGISTER oski_value_t _b6;
		REGISTER oski_value_t _b7;
		/* First off-diagonal block of this block row. */
		const oski_value_t* vp = val + ptr[I-1]*8*7;

		/* Load the (already scaled) 8 entries of this block row. */
		VAL_ASSIGN( _b0, bp[0] );
		VAL_ASSIGN( _b1, bp[1] );
		VAL_ASSIGN( _b2, bp[2] );
		VAL_ASSIGN( _b3, bp[3] );
		VAL_ASSIGN( _b4, bp[4] );
		VAL_ASSIGN( _b5, bp[5] );
		VAL_ASSIGN( _b6, bp[6] );
		VAL_ASSIGN( _b7, bp[7] );

		/*
		 * Solve against the diagonal block from unknown 0 to 7:
		 * divide by the diagonal entry, then remove that unknown
		 * from the later ones using row r of the block
		 * (entries dp[r*8+c] with c > r).
		 */
		VAL_DIVEQ_CONJ( _b0, dp[0*8+0] );
		VAL_MSUB_CONJ( _b1, dp[0*8+1], _b0 );
		VAL_MSUB_CONJ( _b2, dp[0*8+2], _b0 );
		VAL_MSUB_CONJ( _b3, dp[0*8+3], _b0 );
		VAL_MSUB_CONJ( _b4, dp[0*8+4], _b0 );
		VAL_MSUB_CONJ( _b5, dp[0*8+5], _b0 );
		VAL_MSUB_CONJ( _b6, dp[0*8+6], _b0 );
		VAL_MSUB_CONJ( _b7, dp[0*8+7], _b0 );
		VAL_DIVEQ_CONJ( _b1, dp[1*8+1] );
		VAL_MSUB_CONJ( _b2, dp[1*8+2], _b1 );
		VAL_MSUB_CONJ( _b3, dp[1*8+3], _b1 );
		VAL_MSUB_CONJ( _b4, dp[1*8+4], _b1 );
		VAL_MSUB_CONJ( _b5, dp[1*8+5], _b1 );
		VAL_MSUB_CONJ( _b6, dp[1*8+6], _b1 );
		VAL_MSUB_CONJ( _b7, dp[1*8+7], _b1 );
		VAL_DIVEQ_CONJ( _b2, dp[2*8+2] );
		VAL_MSUB_CONJ( _b3, dp[2*8+3], _b2 );
		VAL_MSUB_CONJ( _b4, dp[2*8+4], _b2 );
		VAL_MSUB_CONJ( _b5, dp[2*8+5], _b2 );
		VAL_MSUB_CONJ( _b6, dp[2*8+6], _b2 );
		VAL_MSUB_CONJ( _b7, dp[2*8+7], _b2 );
		VAL_DIVEQ_CONJ( _b3, dp[3*8+3] );
		VAL_MSUB_CONJ( _b4, dp[3*8+4], _b3 );
		VAL_MSUB_CONJ( _b5, dp[3*8+5], _b3 );
		VAL_MSUB_CONJ( _b6, dp[3*8+6], _b3 );
		VAL_MSUB_CONJ( _b7, dp[3*8+7], _b3 );
		VAL_DIVEQ_CONJ( _b4, dp[4*8+4] );
		VAL_MSUB_CONJ( _b5, dp[4*8+5], _b4 );
		VAL_MSUB_CONJ( _b6, dp[4*8+6], _b4 );
		VAL_MSUB_CONJ( _b7, dp[4*8+7], _b4 );
		VAL_DIVEQ_CONJ( _b5, dp[5*8+5] );
		VAL_MSUB_CONJ( _b6, dp[5*8+6], _b5 );
		VAL_MSUB_CONJ( _b7, dp[5*8+7], _b5 );
		VAL_DIVEQ_CONJ( _b6, dp[6*8+6] );
		VAL_MSUB_CONJ( _b7, dp[6*8+7], _b6 );
		VAL_DIVEQ_CONJ( _b7, dp[7*8+7] );

		/* Push the solved values into the entries of x that each
		 * off-diagonal block addresses. */
		for( K = ptr[I-1]; K < ptr[I]; K++, vp += 8*7 )
		{
			oski_index_t j0 = ind[K];
			oski_value_t* xp = x + j0;
			REGISTER oski_value_t _x0;
			REGISTER oski_value_t _x1;
			REGISTER oski_value_t _x2;
			REGISTER oski_value_t _x3;
			REGISTER oski_value_t _x4;
			REGISTER oski_value_t _x5;
			REGISTER oski_value_t _x6;

			/* Current values of the 7 entries to be updated. */
			VAL_ASSIGN( _x0, xp[0] );
			VAL_ASSIGN( _x1, xp[1] );
			VAL_ASSIGN( _x2, xp[2] );
			VAL_ASSIGN( _x3, xp[3] );
			VAL_ASSIGN( _x4, xp[4] );
			VAL_ASSIGN( _x5, xp[5] );
			VAL_ASSIGN( _x6, xp[6] );

			/* Row-by-row update: _x<c> uses vp[r*7+c] and _b<r>. */
			VAL_MSUB_CONJ( _x0, vp[0], _b0 );
			VAL_MSUB_CONJ( _x1, vp[1], _b0 );
			VAL_MSUB_CONJ( _x2, vp[2], _b0 );
			VAL_MSUB_CONJ( _x3, vp[3], _b0 );
			VAL_MSUB_CONJ( _x4, vp[4], _b0 );
			VAL_MSUB_CONJ( _x5, vp[5], _b0 );
			VAL_MSUB_CONJ( _x6, vp[6], _b0 );
			VAL_MSUB_CONJ( _x0, vp[7], _b1 );
			VAL_MSUB_CONJ( _x1, vp[8], _b1 );
			VAL_MSUB_CONJ( _x2, vp[9], _b1 );
			VAL_MSUB_CONJ( _x3, vp[10], _b1 );
			VAL_MSUB_CONJ( _x4, vp[11], _b1 );
			VAL_MSUB_CONJ( _x5, vp[12], _b1 );
			VAL_MSUB_CONJ( _x6, vp[13], _b1 );
			VAL_MSUB_CONJ( _x0, vp[14], _b2 );
			VAL_MSUB_CONJ( _x1, vp[15], _b2 );
			VAL_MSUB_CONJ( _x2, vp[16], _b2 );
			VAL_MSUB_CONJ( _x3, vp[17], _b2 );
			VAL_MSUB_CONJ( _x4, vp[18], _b2 );
			VAL_MSUB_CONJ( _x5, vp[19], _b2 );
			VAL_MSUB_CONJ( _x6, vp[20], _b2 );
			VAL_MSUB_CONJ( _x0, vp[21], _b3 );
			VAL_MSUB_CONJ( _x1, vp[22], _b3 );
			VAL_MSUB_CONJ( _x2, vp[23], _b3 );
			VAL_MSUB_CONJ( _x3, vp[24], _b3 );
			VAL_MSUB_CONJ( _x4, vp[25], _b3 );
			VAL_MSUB_CONJ( _x5, vp[26], _b3 );
			VAL_MSUB_CONJ( _x6, vp[27], _b3 );
			VAL_MSUB_CONJ( _x0, vp[28], _b4 );
			VAL_MSUB_CONJ( _x1, vp[29], _b4 );
			VAL_MSUB_CONJ( _x2, vp[30], _b4 );
			VAL_MSUB_CONJ( _x3, vp[31], _b4 );
			VAL_MSUB_CONJ( _x4, vp[32], _b4 );
			VAL_MSUB_CONJ( _x5, vp[33], _b4 );
			VAL_MSUB_CONJ( _x6, vp[34], _b4 );
			VAL_MSUB_CONJ( _x0, vp[35], _b5 );
			VAL_MSUB_CONJ( _x1, vp[36], _b5 );
			VAL_MSUB_CONJ( _x2, vp[37], _b5 );
			VAL_MSUB_CONJ( _x3, vp[38], _b5 );
			VAL_MSUB_CONJ( _x4, vp[39], _b5 );
			VAL_MSUB_CONJ( _x5, vp[40], _b5 );
			VAL_MSUB_CONJ( _x6, vp[41], _b5 );
			VAL_MSUB_CONJ( _x0, vp[42], _b6 );
			VAL_MSUB_CONJ( _x1, vp[43], _b6 );
			VAL_MSUB_CONJ( _x2, vp[44], _b6 );
			VAL_MSUB_CONJ( _x3, vp[45], _b6 );
			VAL_MSUB_CONJ( _x4, vp[46], _b6 );
			VAL_MSUB_CONJ( _x5, vp[47], _b6 );
			VAL_MSUB_CONJ( _x6, vp[48], _b6 );
			VAL_MSUB_CONJ( _x0, vp[49], _b7 );
			VAL_MSUB_CONJ( _x1, vp[50], _b7 );
			VAL_MSUB_CONJ( _x2, vp[51], _b7 );
			VAL_MSUB_CONJ( _x3, vp[52], _b7 );
			VAL_MSUB_CONJ( _x4, vp[53], _b7 );
			VAL_MSUB_CONJ( _x5, vp[54], _b7 );
			VAL_MSUB_CONJ( _x6, vp[55], _b7 );
			/* Write the updated entries back to x. */
			VAL_ASSIGN( xp[0], _x0 );
			VAL_ASSIGN( xp[1], _x1 );
			VAL_ASSIGN( xp[2], _x2 );
			VAL_ASSIGN( xp[3], _x3 );
			VAL_ASSIGN( xp[4], _x4 );
			VAL_ASSIGN( xp[5], _x5 );
			VAL_ASSIGN( xp[6], _x6 );
		} /* K */

		/* Store the solved entries of this block row. */
		VAL_ASSIGN( bp[0], _b0 );
		VAL_ASSIGN( bp[1], _b1 );
		VAL_ASSIGN( bp[2], _b2 );
		VAL_ASSIGN( bp[3], _b3 );
		VAL_ASSIGN( bp[4], _b4 );
		VAL_ASSIGN( bp[5], _b5 );
		VAL_ASSIGN( bp[6], _b6 );
		VAL_ASSIGN( bp[7], _b7 );
	} /* I */
} /* MBCSR_MatHermTrisolve_Upper_v1_aX_xs1 */

#endif /* IS_VAL_COMPLEX */
#if !IS_VAL_COMPLEX
	/** Synonym for pure real version */
	#define MBCSR_MatHermTrisolve_Upper_v1_aX_xsX MBCSR_MatTransTrisolve_Upper_v1_aX_xsX
#else /* IS_VAL_COMPLEX */

#if defined(DO_NAME_MANGLING)
	/** Mangled name for MBCSR_MatHermTrisolve_Upper_v1_aX_xsX. */
	#define MBCSR_MatHermTrisolve_Upper_v1_aX_xsX \
		MANGLE_MOD_(MBCSR_MatHermTrisolve_Upper_v1_aX_xsX_8x7)
#endif

#endif
#if IS_VAL_COMPLEX
/**
 *  \brief The \f$8\times 7\f$ MBCSR implementation
 *  of \f$\bar{U}^{-T}\cdot b\f$, where x has general-stride.
 *
 *  The solve is performed in place on \a x. Block rows are visited
 *  in increasing order; for each one the \f$8\times 8\f$ diagonal
 *  block is eliminated first, and the result is then propagated
 *  through every off-diagonal \f$8\times 7\f$ block stored in that
 *  block row.
 *
 *  \param[in] M Number of block rows to process.
 *  \param[in] d0 Element offset (in units of \a incx) of the first
 *  processed entry of \a x.
 *  \param[in] ptr Block-row pointers; the off-diagonal blocks of the
 *  I-th block row (1-based loop index) are ptr[I-1] .. ptr[I]-1.
 *  \param[in] ind For each off-diagonal block, the element index into
 *  \a x (before scaling by \a incx) of the first of the 7 entries it
 *  updates.
 *  \param[in] val Off-diagonal block values, 8*7 values per block,
 *  indexed as val[r*7 + c].
 *  \param[in] diag Diagonal block values, 8*8 values per block,
 *  indexed as diag[r*8 + c]; only entries with c >= r are read.
 *  \param[in] alpha Scalar handed to BLAS_xSCAL before the solve.
 *  \param[in,out] x Right-hand side on entry, overwritten in place.
 *  \param[in] incx Stride between consecutive logical elements of \a x.
 */
void
MBCSR_MatHermTrisolve_Upper_v1_aX_xsX(
	oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict ptr, const oski_index_t* restrict ind,
	const oski_value_t* restrict val, const oski_value_t* restrict diag,
	oski_value_t alpha, oski_value_t* restrict x, oski_index_t incx )
{
	oski_index_t I;
	/* bp: first entry of the current block row's 8 right-hand-side values. */
	oski_value_t* bp = x + d0*incx;
	/* dp: current 8x8 diagonal block. */
	const oski_value_t* dp = diag;

	/*
	 * Pre-scale M*8 strided entries by alpha (presumably the BLAS
	 * xSCAL routine for the configured value type).
	 * NOTE(review): the call is given 'x', whereas the rows processed
	 * below start at 'bp = x + d0*incx'; the two ranges only coincide
	 * when d0 == 0 -- confirm this is intended by the callers.
	 */
	{
		oski_index_t m = M * 8;
		BLAS_xSCAL( &m, &alpha, x, &incx );
	}

	/* One iteration per block row: advance 8 entries of x and one 8x8 diagonal block. */
	for( I = 1; I != M+1; I++, bp += 8*incx, dp += 8*8 )
	{
		oski_index_t K;
		REGISTER oski_value_t _b0;
		REGISTER oski_value_t _b1;
		REGISTER oski_value_t _b2;
		REGISTER oski_value_t _b3;
		REGISTER oski_value_t _b4;
		REGISTER oski_value_t _b5;
		REGISTER oski_value_t _b6;
		REGISTER oski_value_t _b7;
		/* vp: first off-diagonal block of this block row (8*7 values per block). */
		const oski_value_t* vp = val + ptr[I-1]*8*7;

		/* Load the 8 strided right-hand-side entries into locals. */
		VAL_ASSIGN( _b0, bp[0] );
		VAL_ASSIGN( _b1, bp[1*incx] );
		VAL_ASSIGN( _b2, bp[2*incx] );
		VAL_ASSIGN( _b3, bp[3*incx] );
		VAL_ASSIGN( _b4, bp[4*incx] );
		VAL_ASSIGN( _b5, bp[5*incx] );
		VAL_ASSIGN( _b6, bp[6*incx] );
		VAL_ASSIGN( _b7, bp[7*incx] );

		/*
		 * Diagonal-block elimination, fully unrolled. Step i divides
		 * _b<i> by dp[i*8+i] and then subtracts the contribution of
		 * _b<i> from every later _b<j> (j > i) using dp[i*8+j].
		 * Presumably VAL_DIVEQ_CONJ(a, d) is a /= conj(d) and
		 * VAL_MSUB_CONJ(a, v, b) is a -= conj(v)*b -- the macros are
		 * defined outside this file. The statement order matters:
		 * each _b<i> must be final before it is used in later updates.
		 */
		VAL_DIVEQ_CONJ( _b0, dp[0*8+0] );
		VAL_MSUB_CONJ( _b1, dp[0*8+1], _b0 );
		VAL_MSUB_CONJ( _b2, dp[0*8+2], _b0 );
		VAL_MSUB_CONJ( _b3, dp[0*8+3], _b0 );
		VAL_MSUB_CONJ( _b4, dp[0*8+4], _b0 );
		VAL_MSUB_CONJ( _b5, dp[0*8+5], _b0 );
		VAL_MSUB_CONJ( _b6, dp[0*8+6], _b0 );
		VAL_MSUB_CONJ( _b7, dp[0*8+7], _b0 );
		VAL_DIVEQ_CONJ( _b1, dp[1*8+1] );
		VAL_MSUB_CONJ( _b2, dp[1*8+2], _b1 );
		VAL_MSUB_CONJ( _b3, dp[1*8+3], _b1 );
		VAL_MSUB_CONJ( _b4, dp[1*8+4], _b1 );
		VAL_MSUB_CONJ( _b5, dp[1*8+5], _b1 );
		VAL_MSUB_CONJ( _b6, dp[1*8+6], _b1 );
		VAL_MSUB_CONJ( _b7, dp[1*8+7], _b1 );
		VAL_DIVEQ_CONJ( _b2, dp[2*8+2] );
		VAL_MSUB_CONJ( _b3, dp[2*8+3], _b2 );
		VAL_MSUB_CONJ( _b4, dp[2*8+4], _b2 );
		VAL_MSUB_CONJ( _b5, dp[2*8+5], _b2 );
		VAL_MSUB_CONJ( _b6, dp[2*8+6], _b2 );
		VAL_MSUB_CONJ( _b7, dp[2*8+7], _b2 );
		VAL_DIVEQ_CONJ( _b3, dp[3*8+3] );
		VAL_MSUB_CONJ( _b4, dp[3*8+4], _b3 );
		VAL_MSUB_CONJ( _b5, dp[3*8+5], _b3 );
		VAL_MSUB_CONJ( _b6, dp[3*8+6], _b3 );
		VAL_MSUB_CONJ( _b7, dp[3*8+7], _b3 );
		VAL_DIVEQ_CONJ( _b4, dp[4*8+4] );
		VAL_MSUB_CONJ( _b5, dp[4*8+5], _b4 );
		VAL_MSUB_CONJ( _b6, dp[4*8+6], _b4 );
		VAL_MSUB_CONJ( _b7, dp[4*8+7], _b4 );
		VAL_DIVEQ_CONJ( _b5, dp[5*8+5] );
		VAL_MSUB_CONJ( _b6, dp[5*8+6], _b5 );
		VAL_MSUB_CONJ( _b7, dp[5*8+7], _b5 );
		VAL_DIVEQ_CONJ( _b6, dp[6*8+6] );
		VAL_MSUB_CONJ( _b7, dp[6*8+7], _b6 );
		VAL_DIVEQ_CONJ( _b7, dp[7*8+7] );

		/*
		 * Propagate the 8 values just computed through each
		 * off-diagonal 8x7 block of this block row: the 7 entries of
		 * x starting at element ind[K] are each updated with all 8
		 * values _b0.._b7, using vp[r*7 + c] for (_b<r>, _x<c>).
		 */
		for( K = ptr[I-1]; K < ptr[I]; K++, vp += 8*7 )
		{
			oski_index_t j0 = ind[K];
			oski_value_t* xp = x + j0*incx;
			REGISTER oski_value_t _x0;
			REGISTER oski_value_t _x1;
			REGISTER oski_value_t _x2;
			REGISTER oski_value_t _x3;
			REGISTER oski_value_t _x4;
			REGISTER oski_value_t _x5;
			REGISTER oski_value_t _x6;

			/* Load the 7 strided target entries. */
			VAL_ASSIGN( _x0, xp[0] );
			VAL_ASSIGN( _x1, xp[1*incx] );
			VAL_ASSIGN( _x2, xp[2*incx] );
			VAL_ASSIGN( _x3, xp[3*incx] );
			VAL_ASSIGN( _x4, xp[4*incx] );
			VAL_ASSIGN( _x5, xp[5*incx] );
			VAL_ASSIGN( _x6, xp[6*incx] );

			/* Block row 0 of the 8x7 block (uses _b0). */
			VAL_MSUB_CONJ( _x0, vp[0], _b0 );
			VAL_MSUB_CONJ( _x1, vp[1], _b0 );
			VAL_MSUB_CONJ( _x2, vp[2], _b0 );
			VAL_MSUB_CONJ( _x3, vp[3], _b0 );
			VAL_MSUB_CONJ( _x4, vp[4], _b0 );
			VAL_MSUB_CONJ( _x5, vp[5], _b0 );
			VAL_MSUB_CONJ( _x6, vp[6], _b0 );
			/* Block row 1 (uses _b1). */
			VAL_MSUB_CONJ( _x0, vp[7], _b1 );
			VAL_MSUB_CONJ( _x1, vp[8], _b1 );
			VAL_MSUB_CONJ( _x2, vp[9], _b1 );
			VAL_MSUB_CONJ( _x3, vp[10], _b1 );
			VAL_MSUB_CONJ( _x4, vp[11], _b1 );
			VAL_MSUB_CONJ( _x5, vp[12], _b1 );
			VAL_MSUB_CONJ( _x6, vp[13], _b1 );
			/* Block row 2 (uses _b2). */
			VAL_MSUB_CONJ( _x0, vp[14], _b2 );
			VAL_MSUB_CONJ( _x1, vp[15], _b2 );
			VAL_MSUB_CONJ( _x2, vp[16], _b2 );
			VAL_MSUB_CONJ( _x3, vp[17], _b2 );
			VAL_MSUB_CONJ( _x4, vp[18], _b2 );
			VAL_MSUB_CONJ( _x5, vp[19], _b2 );
			VAL_MSUB_CONJ( _x6, vp[20], _b2 );
			/* Block row 3 (uses _b3). */
			VAL_MSUB_CONJ( _x0, vp[21], _b3 );
			VAL_MSUB_CONJ( _x1, vp[22], _b3 );
			VAL_MSUB_CONJ( _x2, vp[23], _b3 );
			VAL_MSUB_CONJ( _x3, vp[24], _b3 );
			VAL_MSUB_CONJ( _x4, vp[25], _b3 );
			VAL_MSUB_CONJ( _x5, vp[26], _b3 );
			VAL_MSUB_CONJ( _x6, vp[27], _b3 );
			/* Block row 4 (uses _b4). */
			VAL_MSUB_CONJ( _x0, vp[28], _b4 );
			VAL_MSUB_CONJ( _x1, vp[29], _b4 );
			VAL_MSUB_CONJ( _x2, vp[30], _b4 );
			VAL_MSUB_CONJ( _x3, vp[31], _b4 );
			VAL_MSUB_CONJ( _x4, vp[32], _b4 );
			VAL_MSUB_CONJ( _x5, vp[33], _b4 );
			VAL_MSUB_CONJ( _x6, vp[34], _b4 );
			/* Block row 5 (uses _b5). */
			VAL_MSUB_CONJ( _x0, vp[35], _b5 );
			VAL_MSUB_CONJ( _x1, vp[36], _b5 );
			VAL_MSUB_CONJ( _x2, vp[37], _b5 );
			VAL_MSUB_CONJ( _x3, vp[38], _b5 );
			VAL_MSUB_CONJ( _x4, vp[39], _b5 );
			VAL_MSUB_CONJ( _x5, vp[40], _b5 );
			VAL_MSUB_CONJ( _x6, vp[41], _b5 );
			/* Block row 6 (uses _b6). */
			VAL_MSUB_CONJ( _x0, vp[42], _b6 );
			VAL_MSUB_CONJ( _x1, vp[43], _b6 );
			VAL_MSUB_CONJ( _x2, vp[44], _b6 );
			VAL_MSUB_CONJ( _x3, vp[45], _b6 );
			VAL_MSUB_CONJ( _x4, vp[46], _b6 );
			VAL_MSUB_CONJ( _x5, vp[47], _b6 );
			VAL_MSUB_CONJ( _x6, vp[48], _b6 );
			/* Block row 7 (uses _b7). */
			VAL_MSUB_CONJ( _x0, vp[49], _b7 );
			VAL_MSUB_CONJ( _x1, vp[50], _b7 );
			VAL_MSUB_CONJ( _x2, vp[51], _b7 );
			VAL_MSUB_CONJ( _x3, vp[52], _b7 );
			VAL_MSUB_CONJ( _x4, vp[53], _b7 );
			VAL_MSUB_CONJ( _x5, vp[54], _b7 );
			VAL_MSUB_CONJ( _x6, vp[55], _b7 );
			/* Write the 7 updated entries back to x. */
			VAL_ASSIGN( xp[0], _x0 );
			VAL_ASSIGN( xp[1*incx], _x1 );
			VAL_ASSIGN( xp[2*incx], _x2 );
			VAL_ASSIGN( xp[3*incx], _x3 );
			VAL_ASSIGN( xp[4*incx], _x4 );
			VAL_ASSIGN( xp[5*incx], _x5 );
			VAL_ASSIGN( xp[6*incx], _x6 );
		} /* K */

		/* Store the 8 solved values for this block row back into x. */
		VAL_ASSIGN( bp[0], _b0 );
		VAL_ASSIGN( bp[1*incx], _b1 );
		VAL_ASSIGN( bp[2*incx], _b2 );
		VAL_ASSIGN( bp[3*incx], _b3 );
		VAL_ASSIGN( bp[4*incx], _b4 );
		VAL_ASSIGN( bp[5*incx], _b5 );
		VAL_ASSIGN( bp[6*incx], _b6 );
		VAL_ASSIGN( bp[7*incx], _b7 );
	} /* I */
} /* MBCSR_MatHermTrisolve_Upper_v1_aX_xsX */

#endif /* IS_VAL_COMPLEX */
/**
 *  \brief Module-level wrapper that applies the \f$8\times 7\f$
 *  conjugate-transpose, upper-triangular solve kernel to every
 *  column of a vector view.
 *
 *  The unit-stride kernel is selected when x->rowinc is 1;
 *  otherwise the general-stride kernel is called with x->rowinc
 *  as the element stride. Columns are visited in order, stepping
 *  by x->colinc values between consecutive columns.
 *
 *  \param[in] T Submatrix whose block size must be 8x7.
 *  \param[in] alpha Scalar forwarded unchanged to the kernel.
 *  \param[in,out] x Vector view solved in place, column by column.
 */
static void
MatHermTrisolve_Upper( const oski_submatMBCSR_t* T,
	oski_value_t alpha, oski_vecview_t x )
{
	oski_index_t col;
	oski_value_t* colp;

	assert( T != NULL );
	assert( T->r == 8 );
	assert( T->c == 7 );
	assert( x != INVALID_VEC );

	colp = x->val;
	if( x->rowinc == 1 ) {
		/* Unit row stride: use the specialized kernel. */
		for( col = 0; col < x->num_cols; col++ ) {
			MBCSR_MatHermTrisolve_Upper_v1_aX_xs1( T->num_block_rows, T->offset,
				T->bptr, T->bind, T->bval, T->bdiag, alpha, colp );
			colp += x->colinc;
		}
	} else {
		/* General (non-unit) row stride. */
		for( col = 0; col < x->num_cols; col++ ) {
			MBCSR_MatHermTrisolve_Upper_v1_aX_xsX( T->num_block_rows, T->offset,
				T->bptr, T->bind, T->bval, T->bdiag, alpha, colp, x->rowinc );
			colp += x->colinc;
		}
	}
}


#if defined(DO_NAME_MANGLING)
/** Mangled name for primary exportable symbol */
#define SubmatReprTrisolve MANGLE_MOD_(SubmatReprTrisolve_8x7)
#endif

/**
 *  \brief Entry point to the 8x7 sparse triangular solve kernels.
 *
 *  Dispatches to the wrapper matching the requested triangle
 *  (lower or upper) and the transpose/conjugate option.
 *
 *  \param[in] T Submatrix handed to the selected wrapper.
 *  \param[in] is_lower Non-zero selects the lower-triangular
 *  wrappers; zero selects the upper-triangular ones.
 *  \param[in] opT One of OP_NORMAL, OP_TRANS, OP_CONJ or
 *  OP_CONJ_TRANS.
 *  \param[in] alpha Scalar forwarded to the selected wrapper.
 *  \param[in,out] x Vector view forwarded to the selected wrapper.
 *  \returns 0 after dispatching to a wrapper, or ERR_NOT_IMPLEMENTED
 *  if \a opT is none of the four recognized values (in which case
 *  no wrapper is called).
 */
int
SubmatReprTrisolve( const oski_submatMBCSR_t* T, int is_lower,
	oski_matop_t opT, oski_value_t alpha, oski_vecview_t x )
{
	switch( opT ) {
		case OP_NORMAL:
			if( is_lower )
				MatTrisolve_Lower( T, alpha, x );
			else
				MatTrisolve_Upper( T, alpha, x );
			break;
		case OP_TRANS:
			if( is_lower )
				MatTransTrisolve_Lower( T, alpha, x );
			else
				MatTransTrisolve_Upper( T, alpha, x );
			break;
		case OP_CONJ:
			if( is_lower )
				MatConjTrisolve_Lower( T, alpha, x );
			else
				MatConjTrisolve_Upper( T, alpha, x );
			break;
		case OP_CONJ_TRANS:
			if( is_lower )
				MatHermTrisolve_Lower( T, alpha, x );
			else
				MatHermTrisolve_Upper( T, alpha, x );
			break;
		default:
			/* Unrecognized operation: nothing is solved. */
			return ERR_NOT_IMPLEMENTED;
	}
	return 0;
}

/* eof */
