/**
 *  \file src/MBCSR/MatMult/7x8.c
 *  \brief MBCSR 7x8 SpMV implementation, for all transpose options.
 *  \ingroup MATTYPE_MBCSR
 *
 *  Automatically generated by ./gen_symm.sh on Wed Jun  8 16:00:21 PDT 2005.
 */

#if HAVE_CONFIG_H
#include <config/config.h> /* for 'restrict' keyword */
#endif

#include <assert.h>

#include <oski/common.h>
#include <oski/mangle.h>
#include <oski/vecview.h>
#include <oski/MBCSR/format.h>
#include <oski/MBCSR/module.h>

/*
 * REGISTER is the storage-class prefix placed on the scalar temporaries
 * declared inside the kernels in this file.  It expands to nothing when
 * IS_VAL_COMPLEX is non-zero and to the 'register' keyword otherwise.
 * NOTE(review): IS_VAL_COMPLEX is not defined in this file; presumably it
 * is supplied by one of the headers included above -- confirm.
 */
#if IS_VAL_COMPLEX
/** Complex-valued, so do not use explicit 'register' keyword */
#define REGISTER
#else
/** Real-valued, so use explicit 'register' keyword */
#define REGISTER register
#endif



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1 (7x8 block variant) */
#define MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1 MANGLE_MOD_(MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$
 *  for 7x8 off-diagonal blocks, with unit-stride x and y.
 *
 *  The work is done in two passes over the M block rows:
 *   -# For every stored off-diagonal block, the block is applied to x
 *      (updating the 7 entries of y owned by the current block row) and
 *      its transpose is applied to the alpha-scaled 7 entries of x owned
 *      by the current block row (updating 8 entries of y at the block's
 *      column offset).
 *   -# Every 7x7 diagonal block is applied to its own 7 entries of x.
 *
 *  \param[in] M      Number of block rows to process.
 *  \param[in] d0     Offset added to both x and y to reach the first
 *                    block row.
 *  \param[in] bptr   Block-row pointers; block row I owns
 *                    bptr[I+1] - bptr[I] off-diagonal blocks.
 *  \param[in] bind   One entry per off-diagonal block: the element offset
 *                    into x and y of the block's leftmost column.
 *  \param[in] bval   Off-diagonal block values, 7*8 per block, addressed
 *                    as bval[r*8 + c].
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row, addressed
 *                    as bdiag[r*7 + c].
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x      Input vector (unit stride).
 *  \param[in,out] y  Output vector (unit stride), accumulated in place.
 */
void
MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y )
{
	enum { blk_rows = 7, blk_cols = 8 };
	oski_value_t* y_row = y + d0;
	const oski_value_t* x_row = x + d0;
	oski_index_t brow;

	/* Pass 1: off-diagonal blocks and their transposes */
	for( brow = 0; brow < M; brow++ )
	{
		oski_value_t row_sum[blk_rows];  /* unscaled A*x for this block row */
		oski_value_t ax[blk_rows];       /* alpha * x for this block row */
		oski_index_t k;
		int r, c;

		for( r = 0; r < blk_rows; r++ )
		{
			VAL_SET_ZERO( row_sum[r] );
		}
		for( r = 0; r < blk_rows; r++ )
		{
			VAL_MUL( ax[r], alpha, x_row[r] );
		}

		for( k = bptr[brow]; k < bptr[brow+1]; k++ )
		{
			const oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* x_col = x + j0;
			oski_value_t* y_col = y + j0;
			oski_value_t x_blk[blk_cols];    /* x entries under this block */
			oski_value_t col_sum[blk_cols];  /* transpose contribution */

			for( c = 0; c < blk_cols; c++ )
			{
				VAL_SET_ZERO( col_sum[c] );
			}
			for( c = 0; c < blk_cols; c++ )
			{
				VAL_ASSIGN( x_blk[c], x_col[c] );
			}

			/* Transposed block times alpha*x: row-by-row sweep */
			for( r = 0; r < blk_rows; r++ )
			{
				for( c = 0; c < blk_cols; c++ )
				{
					VAL_MAC( col_sum[c], bval[r*blk_cols + c], ax[r] );
				}
			}

			/* Block times x: column-by-column sweep */
			for( c = 0; c < blk_cols; c++ )
			{
				for( r = 0; r < blk_rows; r++ )
				{
					VAL_MAC( row_sum[r], bval[r*blk_cols + c], x_blk[c] );
				}
			}

			for( c = 0; c < blk_cols; c++ )
			{
				VAL_INC( y_col[c], col_sum[c] );
			}

			/* Step to the next stored block */
			bind++;
			bval += blk_rows*blk_cols;
		}

		/* alpha is applied to the row sums only once, on write-back */
		for( r = 0; r < blk_rows; r++ )
		{
			VAL_MAC( y_row[r], alpha, row_sum[r] );
		}

		y_row += blk_rows;
		x_row += blk_rows;
	}

	/* Pass 2: diagonal block multiply */
	y_row = y + d0;
	x_row = x + d0;
	for( brow = 0; brow < M; brow++ )
	{
		oski_value_t x_loc[blk_rows];
		oski_value_t y_loc[blk_rows];
		int r, c;

		for( r = 0; r < blk_rows; r++ )
		{
			VAL_ASSIGN( x_loc[r], x_row[r] );
		}
		for( r = 0; r < blk_rows; r++ )
		{
			VAL_SET_ZERO( y_loc[r] );
		}

		for( c = 0; c < blk_rows; c++ )
		{
			for( r = 0; r < blk_rows; r++ )
			{
				VAL_MAC( y_loc[r], bdiag[r*blk_rows + c], x_loc[c] );
			}
		}

		for( r = 0; r < blk_rows; r++ )
		{
			VAL_MAC( y_row[r], alpha, y_loc[r] );
		}

		bdiag += blk_rows*blk_rows;
		y_row += blk_rows;
		x_row += blk_rows;
	}
}


#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX (7x8 block variant) */
#define MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX MANGLE_MOD_(MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$
 *  for 7x8 off-diagonal blocks, with unit-stride x and general-stride y.
 *
 *  The work is done in two passes over the M block rows:
 *   -# For every stored off-diagonal block, the block is applied to x
 *      (updating the 7 entries of y owned by the current block row) and
 *      its transpose is applied to the alpha-scaled 7 entries of x owned
 *      by the current block row (updating 8 entries of y at the block's
 *      column offset).
 *   -# Every 7x7 diagonal block is applied to its own 7 entries of x.
 *
 *  \param[in] M      Number of block rows to process.
 *  \param[in] d0     Logical offset of the first block row; x is offset
 *                    by d0 and y by d0*incy.
 *  \param[in] bptr   Block-row pointers; block row I owns
 *                    bptr[I+1] - bptr[I] off-diagonal blocks.
 *  \param[in] bind   One entry per off-diagonal block: the logical offset
 *                    of the block's leftmost column (scaled by incy when
 *                    addressing y).
 *  \param[in] bval   Off-diagonal block values, 7*8 per block, addressed
 *                    as bval[r*8 + c].
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row, addressed
 *                    as bdiag[r*7 + c].
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x      Input vector (unit stride).
 *  \param[in,out] y  Output vector, accumulated in place.
 *  \param[in] incy   Distance, in elements, between logically consecutive
 *                    entries of y.
 */
void
MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y, oski_index_t incy )
{
	enum { blk_rows = 7, blk_cols = 8 };
	oski_value_t* y_row = y + d0*incy;
	const oski_value_t* x_row = x + d0;
	oski_index_t brow;

	/* Pass 1: off-diagonal blocks and their transposes */
	for( brow = 0; brow < M; brow++ )
	{
		oski_value_t row_sum[blk_rows];  /* unscaled A*x for this block row */
		oski_value_t ax[blk_rows];       /* alpha * x for this block row */
		oski_index_t k;
		int r, c;

		for( r = 0; r < blk_rows; r++ )
		{
			VAL_SET_ZERO( row_sum[r] );
		}
		for( r = 0; r < blk_rows; r++ )
		{
			VAL_MUL( ax[r], alpha, x_row[r] );
		}

		for( k = bptr[brow]; k < bptr[brow+1]; k++ )
		{
			const oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* x_col = x + j0;
			oski_value_t* y_col = y + j0*incy;
			oski_value_t x_blk[blk_cols];    /* x entries under this block */
			oski_value_t col_sum[blk_cols];  /* transpose contribution */

			for( c = 0; c < blk_cols; c++ )
			{
				VAL_SET_ZERO( col_sum[c] );
			}
			for( c = 0; c < blk_cols; c++ )
			{
				VAL_ASSIGN( x_blk[c], x_col[c] );
			}

			/* Transposed block times alpha*x: row-by-row sweep */
			for( r = 0; r < blk_rows; r++ )
			{
				for( c = 0; c < blk_cols; c++ )
				{
					VAL_MAC( col_sum[c], bval[r*blk_cols + c], ax[r] );
				}
			}

			/* Block times x: column-by-column sweep */
			for( c = 0; c < blk_cols; c++ )
			{
				for( r = 0; r < blk_rows; r++ )
				{
					VAL_MAC( row_sum[r], bval[r*blk_cols + c], x_blk[c] );
				}
			}

			for( c = 0; c < blk_cols; c++ )
			{
				VAL_INC( y_col[c*incy], col_sum[c] );
			}

			/* Step to the next stored block */
			bind++;
			bval += blk_rows*blk_cols;
		}

		/* alpha is applied to the row sums only once, on write-back */
		for( r = 0; r < blk_rows; r++ )
		{
			VAL_MAC( y_row[r*incy], alpha, row_sum[r] );
		}

		y_row += blk_rows*incy;
		x_row += blk_rows;
	}

	/* Pass 2: diagonal block multiply */
	y_row = y + d0*incy;
	x_row = x + d0;
	for( brow = 0; brow < M; brow++ )
	{
		oski_value_t x_loc[blk_rows];
		oski_value_t y_loc[blk_rows];
		int r, c;

		for( r = 0; r < blk_rows; r++ )
		{
			VAL_ASSIGN( x_loc[r], x_row[r] );
		}
		for( r = 0; r < blk_rows; r++ )
		{
			VAL_SET_ZERO( y_loc[r] );
		}

		for( c = 0; c < blk_rows; c++ )
		{
			for( r = 0; r < blk_rows; r++ )
			{
				VAL_MAC( y_loc[r], bdiag[r*blk_rows + c], x_loc[c] );
			}
		}

		for( r = 0; r < blk_rows; r++ )
		{
			VAL_MAC( y_row[r*incy], alpha, y_loc[r] );
		}

		bdiag += blk_rows*blk_rows;
		y_row += blk_rows*incy;
		x_row += blk_rows;
	}
}


#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1 (7x8 block variant) */
#define MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1 MANGLE_MOD_(MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$
 *  for 7x8 off-diagonal blocks, with general-stride x and unit-stride y.
 *
 *  The work is done in two passes over the M block rows:
 *   -# For every stored off-diagonal block, the block is applied to x
 *      (updating the 7 entries of y owned by the current block row) and
 *      its transpose is applied to the alpha-scaled 7 entries of x owned
 *      by the current block row (updating 8 entries of y at the block's
 *      column offset).
 *   -# Every 7x7 diagonal block is applied to its own 7 entries of x.
 *
 *  \param[in] M      Number of block rows to process.
 *  \param[in] d0     Logical offset of the first block row; x is offset
 *                    by d0*incx and y by d0.
 *  \param[in] bptr   Block-row pointers; block row I owns
 *                    bptr[I+1] - bptr[I] off-diagonal blocks.
 *  \param[in] bind   One entry per off-diagonal block: the logical offset
 *                    of the block's leftmost column (scaled by incx when
 *                    addressing x).
 *  \param[in] bval   Off-diagonal block values, 7*8 per block, addressed
 *                    as bval[r*8 + c].
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row, addressed
 *                    as bdiag[r*7 + c].
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x      Input vector.
 *  \param[in] incx   Distance, in elements, between logically consecutive
 *                    entries of x.
 *  \param[in,out] y  Output vector (unit stride), accumulated in place.
 */
void
MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y )
{
	enum { blk_rows = 7, blk_cols = 8 };
	oski_value_t* y_row = y + d0;
	const oski_value_t* x_row = x + d0*incx;
	oski_index_t brow;

	/* Pass 1: off-diagonal blocks and their transposes */
	for( brow = 0; brow < M; brow++ )
	{
		oski_value_t row_sum[blk_rows];  /* unscaled A*x for this block row */
		oski_value_t ax[blk_rows];       /* alpha * x for this block row */
		oski_index_t k;
		int r, c;

		for( r = 0; r < blk_rows; r++ )
		{
			VAL_SET_ZERO( row_sum[r] );
		}
		for( r = 0; r < blk_rows; r++ )
		{
			VAL_MUL( ax[r], alpha, x_row[r*incx] );
		}

		for( k = bptr[brow]; k < bptr[brow+1]; k++ )
		{
			const oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* x_col = x + j0*incx;
			oski_value_t* y_col = y + j0;
			oski_value_t x_blk[blk_cols];    /* x entries under this block */
			oski_value_t col_sum[blk_cols];  /* transpose contribution */

			for( c = 0; c < blk_cols; c++ )
			{
				VAL_SET_ZERO( col_sum[c] );
			}
			for( c = 0; c < blk_cols; c++ )
			{
				VAL_ASSIGN( x_blk[c], x_col[c*incx] );
			}

			/* Transposed block times alpha*x: row-by-row sweep */
			for( r = 0; r < blk_rows; r++ )
			{
				for( c = 0; c < blk_cols; c++ )
				{
					VAL_MAC( col_sum[c], bval[r*blk_cols + c], ax[r] );
				}
			}

			/* Block times x: column-by-column sweep */
			for( c = 0; c < blk_cols; c++ )
			{
				for( r = 0; r < blk_rows; r++ )
				{
					VAL_MAC( row_sum[r], bval[r*blk_cols + c], x_blk[c] );
				}
			}

			for( c = 0; c < blk_cols; c++ )
			{
				VAL_INC( y_col[c], col_sum[c] );
			}

			/* Step to the next stored block */
			bind++;
			bval += blk_rows*blk_cols;
		}

		/* alpha is applied to the row sums only once, on write-back */
		for( r = 0; r < blk_rows; r++ )
		{
			VAL_MAC( y_row[r], alpha, row_sum[r] );
		}

		y_row += blk_rows;
		x_row += blk_rows*incx;
	}

	/* Pass 2: diagonal block multiply */
	y_row = y + d0;
	x_row = x + d0*incx;
	for( brow = 0; brow < M; brow++ )
	{
		oski_value_t x_loc[blk_rows];
		oski_value_t y_loc[blk_rows];
		int r, c;

		for( r = 0; r < blk_rows; r++ )
		{
			VAL_ASSIGN( x_loc[r], x_row[r*incx] );
		}
		for( r = 0; r < blk_rows; r++ )
		{
			VAL_SET_ZERO( y_loc[r] );
		}

		for( c = 0; c < blk_rows; c++ )
		{
			for( r = 0; r < blk_rows; r++ )
			{
				VAL_MAC( y_loc[r], bdiag[r*blk_rows + c], x_loc[c] );
			}
		}

		for( r = 0; r < blk_rows; r++ )
		{
			VAL_MAC( y_row[r], alpha, y_loc[r] );
		}

		bdiag += blk_rows*blk_rows;
		y_row += blk_rows;
		x_row += blk_rows*incx;
	}
}


#if defined(DO_NAME_MANGLING)
/**
 *  Mangled name for MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX.
 *
 *  Only active when DO_NAME_MANGLING is defined: the kernel's identifier
 *  is then replaced by the result of MANGLE_MOD_ applied to the same name
 *  with a "_7x8" block-size suffix appended.
 */
#define MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX MANGLE_MOD_(MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$.
 *
 *  Kernel variant that takes explicit element strides for both
 *  \c x (\c incx) and \c y (\c incy). Every stored 7x8 off-diagonal
 *  block is applied twice: once as stored and once transposed. A
 *  second pass then applies the 7x7 diagonal blocks.
 *
 *  NOTE(review): the inline notes assume VAL_MAC(a, b, c) accumulates
 *  b*c into a, VAL_MUL(a, b, c) stores b*c into a, and VAL_INC(a, b)
 *  adds b to a. These macros are defined outside this file; confirm.
 *
 *  \param[in] M Number of block rows to process.
 *  \param[in] d0 Logical index of the first element of block row 0
 *  within \c x and \c y (multiplied by the respective stride).
 *  \param[in] bptr Block-row pointers; block row \c I owns
 *  <tt>bptr[I+1] - bptr[I]</tt> off-diagonal blocks.
 *  \param[in] bind Leftmost column index of each off-diagonal block,
 *  read sequentially.
 *  \param[in] bval Off-diagonal block values, 7*8 per block, read
 *  sequentially; entry (r, c) of a block is <tt>bval[8*r + c]</tt>.
 *  \param[in] bdiag Diagonal block values, 7*7 per block row; entry
 *  (r, c) of a block is <tt>bdiag[7*r + c]</tt>.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Input vector.
 *  \param[in] incx Stride between consecutive logical elements of \c x.
 *  \param[in,out] y Output vector, updated in place.
 *  \param[in] incy Stride between consecutive logical elements of \c y.
 */
void
MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y, oski_index_t incy )
{
	/* Position yp/xp at the first element of block row 0. */
	oski_value_t* yp = y + d0*incy;
	const oski_value_t* xp = x + d0*incx;
	oski_index_t I;

	/* Off-diagonal pass: one iteration per block row of 7 elements. */
	for( I = 0; I < M; I++, yp += 7*incy, xp += 7*incx )
	{
		/* _y0.._y6: accumulators for the 7 outputs of this block row. */
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		/* _x0.._x6: this block row's entries of x, pre-scaled by alpha. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		oski_index_t K; /* block counter only; never used as an index */

		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		VAL_MUL( _x0, alpha, xp[0] );
		VAL_MUL( _x1, alpha, xp[1*incx] );
		VAL_MUL( _x2, alpha, xp[2*incx] );
		VAL_MUL( _x3, alpha, xp[3*incx] );
		VAL_MUL( _x4, alpha, xp[4*incx] );
		VAL_MUL( _x5, alpha, xp[5*incx] );
		VAL_MUL( _x6, alpha, xp[6*incx] );

		/*
		 * Walk the off-diagonal blocks of block row I. The bind and
		 * bval pointers advance by one block per iteration and carry
		 * over from one block row to the next.
		 */
		for( K = bptr[I]; K < bptr[I+1]; K++, bind++, bval += 7*8 )
		{
			oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* xpp = x + j0*incx;
			oski_value_t* ypp = y + j0*incy;

			/* _xp0.._xp7: unscaled x entries at the block's 8 columns. */
			REGISTER oski_value_t _xp0;
			REGISTER oski_value_t _xp1;
			REGISTER oski_value_t _xp2;
			REGISTER oski_value_t _xp3;
			REGISTER oski_value_t _xp4;
			REGISTER oski_value_t _xp5;
			REGISTER oski_value_t _xp6;
			REGISTER oski_value_t _xp7;
			/* _yp0.._yp7: accumulators for the transposed contribution. */
			REGISTER oski_value_t _yp0;
			REGISTER oski_value_t _yp1;
			REGISTER oski_value_t _yp2;
			REGISTER oski_value_t _yp3;
			REGISTER oski_value_t _yp4;
			REGISTER oski_value_t _yp5;
			REGISTER oski_value_t _yp6;
			REGISTER oski_value_t _yp7;
			VAL_SET_ZERO( _yp0 );
			VAL_SET_ZERO( _yp1 );
			VAL_SET_ZERO( _yp2 );
			VAL_SET_ZERO( _yp3 );
			VAL_SET_ZERO( _yp4 );
			VAL_SET_ZERO( _yp5 );
			VAL_SET_ZERO( _yp6 );
			VAL_SET_ZERO( _yp7 );
			VAL_ASSIGN( _xp0, xpp[0] );
			VAL_ASSIGN( _xp1, xpp[1*incx] );
			VAL_ASSIGN( _xp2, xpp[2*incx] );
			VAL_ASSIGN( _xp3, xpp[3*incx] );
			VAL_ASSIGN( _xp4, xpp[4*incx] );
			VAL_ASSIGN( _xp5, xpp[5*incx] );
			VAL_ASSIGN( _xp6, xpp[6*incx] );
			VAL_ASSIGN( _xp7, xpp[7*incx] );
			/* Transposed block: _yp<c> accumulates bval[8*r + c] * _x<r>. */
			VAL_MAC( _yp0, bval[0], _x0 );
			VAL_MAC( _yp1, bval[1], _x0 );
			VAL_MAC( _yp2, bval[2], _x0 );
			VAL_MAC( _yp3, bval[3], _x0 );
			VAL_MAC( _yp4, bval[4], _x0 );
			VAL_MAC( _yp5, bval[5], _x0 );
			VAL_MAC( _yp6, bval[6], _x0 );
			VAL_MAC( _yp7, bval[7], _x0 );
			VAL_MAC( _yp0, bval[8], _x1 );
			VAL_MAC( _yp1, bval[9], _x1 );
			VAL_MAC( _yp2, bval[10], _x1 );
			VAL_MAC( _yp3, bval[11], _x1 );
			VAL_MAC( _yp4, bval[12], _x1 );
			VAL_MAC( _yp5, bval[13], _x1 );
			VAL_MAC( _yp6, bval[14], _x1 );
			VAL_MAC( _yp7, bval[15], _x1 );
			VAL_MAC( _yp0, bval[16], _x2 );
			VAL_MAC( _yp1, bval[17], _x2 );
			VAL_MAC( _yp2, bval[18], _x2 );
			VAL_MAC( _yp3, bval[19], _x2 );
			VAL_MAC( _yp4, bval[20], _x2 );
			VAL_MAC( _yp5, bval[21], _x2 );
			VAL_MAC( _yp6, bval[22], _x2 );
			VAL_MAC( _yp7, bval[23], _x2 );
			VAL_MAC( _yp0, bval[24], _x3 );
			VAL_MAC( _yp1, bval[25], _x3 );
			VAL_MAC( _yp2, bval[26], _x3 );
			VAL_MAC( _yp3, bval[27], _x3 );
			VAL_MAC( _yp4, bval[28], _x3 );
			VAL_MAC( _yp5, bval[29], _x3 );
			VAL_MAC( _yp6, bval[30], _x3 );
			VAL_MAC( _yp7, bval[31], _x3 );
			VAL_MAC( _yp0, bval[32], _x4 );
			VAL_MAC( _yp1, bval[33], _x4 );
			VAL_MAC( _yp2, bval[34], _x4 );
			VAL_MAC( _yp3, bval[35], _x4 );
			VAL_MAC( _yp4, bval[36], _x4 );
			VAL_MAC( _yp5, bval[37], _x4 );
			VAL_MAC( _yp6, bval[38], _x4 );
			VAL_MAC( _yp7, bval[39], _x4 );
			VAL_MAC( _yp0, bval[40], _x5 );
			VAL_MAC( _yp1, bval[41], _x5 );
			VAL_MAC( _yp2, bval[42], _x5 );
			VAL_MAC( _yp3, bval[43], _x5 );
			VAL_MAC( _yp4, bval[44], _x5 );
			VAL_MAC( _yp5, bval[45], _x5 );
			VAL_MAC( _yp6, bval[46], _x5 );
			VAL_MAC( _yp7, bval[47], _x5 );
			VAL_MAC( _yp0, bval[48], _x6 );
			VAL_MAC( _yp1, bval[49], _x6 );
			VAL_MAC( _yp2, bval[50], _x6 );
			VAL_MAC( _yp3, bval[51], _x6 );
			VAL_MAC( _yp4, bval[52], _x6 );
			VAL_MAC( _yp5, bval[53], _x6 );
			VAL_MAC( _yp6, bval[54], _x6 );
			VAL_MAC( _yp7, bval[55], _x6 );
			/* Block as stored: _y<r> accumulates bval[8*r + c] * _xp<c>. */
			VAL_MAC( _y0, bval[0], _xp0 );
			VAL_MAC( _y1, bval[8], _xp0 );
			VAL_MAC( _y2, bval[16], _xp0 );
			VAL_MAC( _y3, bval[24], _xp0 );
			VAL_MAC( _y4, bval[32], _xp0 );
			VAL_MAC( _y5, bval[40], _xp0 );
			VAL_MAC( _y6, bval[48], _xp0 );
			VAL_MAC( _y0, bval[1], _xp1 );
			VAL_MAC( _y1, bval[9], _xp1 );
			VAL_MAC( _y2, bval[17], _xp1 );
			VAL_MAC( _y3, bval[25], _xp1 );
			VAL_MAC( _y4, bval[33], _xp1 );
			VAL_MAC( _y5, bval[41], _xp1 );
			VAL_MAC( _y6, bval[49], _xp1 );
			VAL_MAC( _y0, bval[2], _xp2 );
			VAL_MAC( _y1, bval[10], _xp2 );
			VAL_MAC( _y2, bval[18], _xp2 );
			VAL_MAC( _y3, bval[26], _xp2 );
			VAL_MAC( _y4, bval[34], _xp2 );
			VAL_MAC( _y5, bval[42], _xp2 );
			VAL_MAC( _y6, bval[50], _xp2 );
			VAL_MAC( _y0, bval[3], _xp3 );
			VAL_MAC( _y1, bval[11], _xp3 );
			VAL_MAC( _y2, bval[19], _xp3 );
			VAL_MAC( _y3, bval[27], _xp3 );
			VAL_MAC( _y4, bval[35], _xp3 );
			VAL_MAC( _y5, bval[43], _xp3 );
			VAL_MAC( _y6, bval[51], _xp3 );
			VAL_MAC( _y0, bval[4], _xp4 );
			VAL_MAC( _y1, bval[12], _xp4 );
			VAL_MAC( _y2, bval[20], _xp4 );
			VAL_MAC( _y3, bval[28], _xp4 );
			VAL_MAC( _y4, bval[36], _xp4 );
			VAL_MAC( _y5, bval[44], _xp4 );
			VAL_MAC( _y6, bval[52], _xp4 );
			VAL_MAC( _y0, bval[5], _xp5 );
			VAL_MAC( _y1, bval[13], _xp5 );
			VAL_MAC( _y2, bval[21], _xp5 );
			VAL_MAC( _y3, bval[29], _xp5 );
			VAL_MAC( _y4, bval[37], _xp5 );
			VAL_MAC( _y5, bval[45], _xp5 );
			VAL_MAC( _y6, bval[53], _xp5 );
			VAL_MAC( _y0, bval[6], _xp6 );
			VAL_MAC( _y1, bval[14], _xp6 );
			VAL_MAC( _y2, bval[22], _xp6 );
			VAL_MAC( _y3, bval[30], _xp6 );
			VAL_MAC( _y4, bval[38], _xp6 );
			VAL_MAC( _y5, bval[46], _xp6 );
			VAL_MAC( _y6, bval[54], _xp6 );
			VAL_MAC( _y0, bval[7], _xp7 );
			VAL_MAC( _y1, bval[15], _xp7 );
			VAL_MAC( _y2, bval[23], _xp7 );
			VAL_MAC( _y3, bval[31], _xp7 );
			VAL_MAC( _y4, bval[39], _xp7 );
			VAL_MAC( _y5, bval[47], _xp7 );
			VAL_MAC( _y6, bval[55], _xp7 );
			/*
			 * Scatter the transposed contribution into y at the block's
			 * columns. No alpha here: it is already folded into _x0.._x6.
			 */
			VAL_INC( ypp[0], _yp0 );
			VAL_INC( ypp[1*incy], _yp1 );
			VAL_INC( ypp[2*incy], _yp2 );
			VAL_INC( ypp[3*incy], _yp3 );
			VAL_INC( ypp[4*incy], _yp4 );
			VAL_INC( ypp[5*incy], _yp5 );
			VAL_INC( ypp[6*incy], _yp6 );
			VAL_INC( ypp[7*incy], _yp7 );
		}
		/* alpha is applied here because _xp0.._xp7 were not pre-scaled. */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1*incy], alpha, _y1 );
		VAL_MAC( yp[2*incy], alpha, _y2 );
		VAL_MAC( yp[3*incy], alpha, _y3 );
		VAL_MAC( yp[4*incy], alpha, _y4 );
		VAL_MAC( yp[5*incy], alpha, _y5 );
		VAL_MAC( yp[6*incy], alpha, _y6 );
	}
	/* Diagonal block multiply */
	/* Rewind to block row 0; bdiag advances by one 7x7 block per row. */
	yp = y + d0*incy;
	xp = x + d0*incx;
	for( I = 0; I < M; I++, bdiag += 7*7, yp += 7*incy, xp += 7*incx )
	{
		/* _x0.._x6: unscaled x entries of this block row. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		/* _y0.._y6: accumulators for the diagonal block product. */
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		VAL_ASSIGN( _x0, xp[0] );
		VAL_ASSIGN( _x1, xp[1*incx] );
		VAL_ASSIGN( _x2, xp[2*incx] );
		VAL_ASSIGN( _x3, xp[3*incx] );
		VAL_ASSIGN( _x4, xp[4*incx] );
		VAL_ASSIGN( _x5, xp[5*incx] );
		VAL_ASSIGN( _x6, xp[6*incx] );
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _y<r> accumulates bdiag[7*r + c] * _x<c>, one column c at a time. */
		VAL_MAC( _y0, bdiag[0], _x0 );
		VAL_MAC( _y1, bdiag[7], _x0 );
		VAL_MAC( _y2, bdiag[14], _x0 );
		VAL_MAC( _y3, bdiag[21], _x0 );
		VAL_MAC( _y4, bdiag[28], _x0 );
		VAL_MAC( _y5, bdiag[35], _x0 );
		VAL_MAC( _y6, bdiag[42], _x0 );
		VAL_MAC( _y0, bdiag[1], _x1 );
		VAL_MAC( _y1, bdiag[8], _x1 );
		VAL_MAC( _y2, bdiag[15], _x1 );
		VAL_MAC( _y3, bdiag[22], _x1 );
		VAL_MAC( _y4, bdiag[29], _x1 );
		VAL_MAC( _y5, bdiag[36], _x1 );
		VAL_MAC( _y6, bdiag[43], _x1 );
		VAL_MAC( _y0, bdiag[2], _x2 );
		VAL_MAC( _y1, bdiag[9], _x2 );
		VAL_MAC( _y2, bdiag[16], _x2 );
		VAL_MAC( _y3, bdiag[23], _x2 );
		VAL_MAC( _y4, bdiag[30], _x2 );
		VAL_MAC( _y5, bdiag[37], _x2 );
		VAL_MAC( _y6, bdiag[44], _x2 );
		VAL_MAC( _y0, bdiag[3], _x3 );
		VAL_MAC( _y1, bdiag[10], _x3 );
		VAL_MAC( _y2, bdiag[17], _x3 );
		VAL_MAC( _y3, bdiag[24], _x3 );
		VAL_MAC( _y4, bdiag[31], _x3 );
		VAL_MAC( _y5, bdiag[38], _x3 );
		VAL_MAC( _y6, bdiag[45], _x3 );
		VAL_MAC( _y0, bdiag[4], _x4 );
		VAL_MAC( _y1, bdiag[11], _x4 );
		VAL_MAC( _y2, bdiag[18], _x4 );
		VAL_MAC( _y3, bdiag[25], _x4 );
		VAL_MAC( _y4, bdiag[32], _x4 );
		VAL_MAC( _y5, bdiag[39], _x4 );
		VAL_MAC( _y6, bdiag[46], _x4 );
		VAL_MAC( _y0, bdiag[5], _x5 );
		VAL_MAC( _y1, bdiag[12], _x5 );
		VAL_MAC( _y2, bdiag[19], _x5 );
		VAL_MAC( _y3, bdiag[26], _x5 );
		VAL_MAC( _y4, bdiag[33], _x5 );
		VAL_MAC( _y5, bdiag[40], _x5 );
		VAL_MAC( _y6, bdiag[47], _x5 );
		VAL_MAC( _y0, bdiag[6], _x6 );
		VAL_MAC( _y1, bdiag[13], _x6 );
		VAL_MAC( _y2, bdiag[20], _x6 );
		VAL_MAC( _y3, bdiag[27], _x6 );
		VAL_MAC( _y4, bdiag[34], _x6 );
		VAL_MAC( _y5, bdiag[41], _x6 );
		VAL_MAC( _y6, bdiag[48], _x6 );
		/* Fold the alpha-scaled diagonal product into y. */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1*incy], alpha, _y1 );
		VAL_MAC( yp[2*incy], alpha, _y2 );
		VAL_MAC( yp[3*incy], alpha, _y3 );
		VAL_MAC( yp[4*incy], alpha, _y4 );
		VAL_MAC( yp[5*incy], alpha, _y5 );
		VAL_MAC( yp[6*incy], alpha, _y6 );
	}
}


/**
 *  \brief Single-vector symmetric multiply: selects the kernel
 *  specialized for the strides of \c x and \c y and forwards all
 *  arguments to it.
 *
 *  A stride equal to 1 selects the unit-stride ("s1") kernel variant
 *  for that vector; any other value selects the general-stride ("sX")
 *  variant, which also receives the stride.
 */
static void
SymmMatMult_v1( oski_index_t M, oski_index_t d0,
	const oski_index_t* bptr, const oski_index_t* bind,
	const oski_value_t* bval, const oski_value_t* bdiag,
	oski_value_t alpha, const oski_value_t* x, oski_index_t incx,
	oski_value_t* y, oski_index_t incy )
{
	const int x_is_unit = ( incx == 1 );
	const int y_is_unit = ( incy == 1 );

	/* Both vectors contiguous. */
	if( x_is_unit && y_is_unit )
	{
		MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y );
		return;
	}

	/* Contiguous x, strided y. */
	if( x_is_unit )
	{
		MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y, incy );
		return;
	}

	/* Strided x, contiguous y. */
	if( y_is_unit )
	{
		MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, incx, y );
		return;
	}

	/* Both vectors strided. */
	MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX( M, d0,
		bptr, bind, bval, bdiag, alpha, x, incx, y, incy );
}


/**
 *  \brief Computes
 *  \f$y \leftarrow y + \alpha\cdot\mathrm{op}(A)\cdot x\f$,
 *  where \f$\mathrm{op}(A) = A\f$, on the fully blocked
 *  portion of \f$A\f$.
 *
 *  Applies SymmMatMult_v1() to each column of the vector views in
 *  turn, stepping by the views' column increments and passing their
 *  row increments as the element strides.
 *
 *  \returns 0 always.
 */
static int
SymmMatMult( const oski_submatMBCSR_t* A,
	oski_value_t alpha, const oski_vecview_t x_view,
	oski_vecview_t y_view )
{
	oski_index_t col; /* index of the column being processed */
	const oski_value_t* x_col; /* start of column 'col' of X */
	oski_value_t* y_col; /* start of column 'col' of Y */

	/* This file only implements the 7x8 block size. */
	assert( A->r == 7 );
	assert( A->c == 8 );

	col = 0;
	x_col = x_view->val;
	y_col = y_view->val;
	while( col < x_view->num_cols )
	{
		SymmMatMult_v1( A->num_block_rows, A->offset,
			A->bptr, A->bind, A->bval, A->bdiag,
			alpha, x_col, x_view->rowinc, y_col, y_view->rowinc );

		col++;
		x_col += x_view->colinc;
		y_col += y_view->colinc;
	}

	return 0;
}


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1(). */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ys1 MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ys1 */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ys1 MANGLE_MOD_(MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ys1_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$.
 *
 *  Unit-stride kernel variant: both \c x and \c y are indexed
 *  contiguously. Every stored 7x8 off-diagonal block is applied
 *  twice: once as stored and once transposed. A second pass then
 *  applies the 7x7 diagonal blocks.
 *
 *  NOTE(review): the inline notes assume VAL_MAC_CONJ(a, b, c)
 *  accumulates conj(b)*c into a, VAL_MAC(a, b, c) accumulates b*c
 *  into a, and VAL_INC(a, b) adds b to a. These macros are defined
 *  outside this file; confirm.
 *
 *  \param[in] M Number of block rows to process.
 *  \param[in] d0 Index of the first element of block row 0 within
 *  \c x and \c y.
 *  \param[in] bptr Block-row pointers; block row \c I owns
 *  <tt>bptr[I+1] - bptr[I]</tt> off-diagonal blocks.
 *  \param[in] bind Leftmost column index of each off-diagonal block,
 *  read sequentially.
 *  \param[in] bval Off-diagonal block values, 7*8 per block, read
 *  sequentially; entry (r, c) of a block is <tt>bval[8*r + c]</tt>.
 *  \param[in] bdiag Diagonal block values, 7*7 per block row; entry
 *  (r, c) of a block is <tt>bdiag[7*r + c]</tt>.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Input vector.
 *  \param[in,out] y Output vector, updated in place.
 */
void
MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y )
{
	/* Position yp/xp at the first element of block row 0. */
	oski_value_t* yp = y + d0;
	const oski_value_t* xp = x + d0;
	oski_index_t I;

	/* Off-diagonal pass: one iteration per block row of 7 elements. */
	for( I = 0; I < M; I++, yp += 7, xp += 7 )
	{
		/* _y0.._y6: accumulators for the 7 outputs of this block row. */
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		/* _x0.._x6: this block row's entries of x, pre-scaled by alpha. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		oski_index_t K; /* block counter only; never used as an index */

		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		VAL_MUL( _x0, alpha, xp[0] );
		VAL_MUL( _x1, alpha, xp[1] );
		VAL_MUL( _x2, alpha, xp[2] );
		VAL_MUL( _x3, alpha, xp[3] );
		VAL_MUL( _x4, alpha, xp[4] );
		VAL_MUL( _x5, alpha, xp[5] );
		VAL_MUL( _x6, alpha, xp[6] );

		/*
		 * Walk the off-diagonal blocks of block row I. The bind and
		 * bval pointers advance by one block per iteration and carry
		 * over from one block row to the next.
		 */
		for( K = bptr[I]; K < bptr[I+1]; K++, bind++, bval += 7*8 )
		{
			oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* xpp = x + j0;
			oski_value_t* ypp = y + j0;

			/* _xp0.._xp7: unscaled x entries at the block's 8 columns. */
			REGISTER oski_value_t _xp0;
			REGISTER oski_value_t _xp1;
			REGISTER oski_value_t _xp2;
			REGISTER oski_value_t _xp3;
			REGISTER oski_value_t _xp4;
			REGISTER oski_value_t _xp5;
			REGISTER oski_value_t _xp6;
			REGISTER oski_value_t _xp7;
			/* _yp0.._yp7: accumulators for the transposed contribution. */
			REGISTER oski_value_t _yp0;
			REGISTER oski_value_t _yp1;
			REGISTER oski_value_t _yp2;
			REGISTER oski_value_t _yp3;
			REGISTER oski_value_t _yp4;
			REGISTER oski_value_t _yp5;
			REGISTER oski_value_t _yp6;
			REGISTER oski_value_t _yp7;
			VAL_SET_ZERO( _yp0 );
			VAL_SET_ZERO( _yp1 );
			VAL_SET_ZERO( _yp2 );
			VAL_SET_ZERO( _yp3 );
			VAL_SET_ZERO( _yp4 );
			VAL_SET_ZERO( _yp5 );
			VAL_SET_ZERO( _yp6 );
			VAL_SET_ZERO( _yp7 );
			VAL_ASSIGN( _xp0, xpp[0] );
			VAL_ASSIGN( _xp1, xpp[1] );
			VAL_ASSIGN( _xp2, xpp[2] );
			VAL_ASSIGN( _xp3, xpp[3] );
			VAL_ASSIGN( _xp4, xpp[4] );
			VAL_ASSIGN( _xp5, xpp[5] );
			VAL_ASSIGN( _xp6, xpp[6] );
			VAL_ASSIGN( _xp7, xpp[7] );
			/* Transposed block: _yp<c> takes bval[8*r + c] with _x<r>. */
			VAL_MAC_CONJ( _yp0, bval[0], _x0 );
			VAL_MAC_CONJ( _yp1, bval[1], _x0 );
			VAL_MAC_CONJ( _yp2, bval[2], _x0 );
			VAL_MAC_CONJ( _yp3, bval[3], _x0 );
			VAL_MAC_CONJ( _yp4, bval[4], _x0 );
			VAL_MAC_CONJ( _yp5, bval[5], _x0 );
			VAL_MAC_CONJ( _yp6, bval[6], _x0 );
			VAL_MAC_CONJ( _yp7, bval[7], _x0 );
			VAL_MAC_CONJ( _yp0, bval[8], _x1 );
			VAL_MAC_CONJ( _yp1, bval[9], _x1 );
			VAL_MAC_CONJ( _yp2, bval[10], _x1 );
			VAL_MAC_CONJ( _yp3, bval[11], _x1 );
			VAL_MAC_CONJ( _yp4, bval[12], _x1 );
			VAL_MAC_CONJ( _yp5, bval[13], _x1 );
			VAL_MAC_CONJ( _yp6, bval[14], _x1 );
			VAL_MAC_CONJ( _yp7, bval[15], _x1 );
			VAL_MAC_CONJ( _yp0, bval[16], _x2 );
			VAL_MAC_CONJ( _yp1, bval[17], _x2 );
			VAL_MAC_CONJ( _yp2, bval[18], _x2 );
			VAL_MAC_CONJ( _yp3, bval[19], _x2 );
			VAL_MAC_CONJ( _yp4, bval[20], _x2 );
			VAL_MAC_CONJ( _yp5, bval[21], _x2 );
			VAL_MAC_CONJ( _yp6, bval[22], _x2 );
			VAL_MAC_CONJ( _yp7, bval[23], _x2 );
			VAL_MAC_CONJ( _yp0, bval[24], _x3 );
			VAL_MAC_CONJ( _yp1, bval[25], _x3 );
			VAL_MAC_CONJ( _yp2, bval[26], _x3 );
			VAL_MAC_CONJ( _yp3, bval[27], _x3 );
			VAL_MAC_CONJ( _yp4, bval[28], _x3 );
			VAL_MAC_CONJ( _yp5, bval[29], _x3 );
			VAL_MAC_CONJ( _yp6, bval[30], _x3 );
			VAL_MAC_CONJ( _yp7, bval[31], _x3 );
			VAL_MAC_CONJ( _yp0, bval[32], _x4 );
			VAL_MAC_CONJ( _yp1, bval[33], _x4 );
			VAL_MAC_CONJ( _yp2, bval[34], _x4 );
			VAL_MAC_CONJ( _yp3, bval[35], _x4 );
			VAL_MAC_CONJ( _yp4, bval[36], _x4 );
			VAL_MAC_CONJ( _yp5, bval[37], _x4 );
			VAL_MAC_CONJ( _yp6, bval[38], _x4 );
			VAL_MAC_CONJ( _yp7, bval[39], _x4 );
			VAL_MAC_CONJ( _yp0, bval[40], _x5 );
			VAL_MAC_CONJ( _yp1, bval[41], _x5 );
			VAL_MAC_CONJ( _yp2, bval[42], _x5 );
			VAL_MAC_CONJ( _yp3, bval[43], _x5 );
			VAL_MAC_CONJ( _yp4, bval[44], _x5 );
			VAL_MAC_CONJ( _yp5, bval[45], _x5 );
			VAL_MAC_CONJ( _yp6, bval[46], _x5 );
			VAL_MAC_CONJ( _yp7, bval[47], _x5 );
			VAL_MAC_CONJ( _yp0, bval[48], _x6 );
			VAL_MAC_CONJ( _yp1, bval[49], _x6 );
			VAL_MAC_CONJ( _yp2, bval[50], _x6 );
			VAL_MAC_CONJ( _yp3, bval[51], _x6 );
			VAL_MAC_CONJ( _yp4, bval[52], _x6 );
			VAL_MAC_CONJ( _yp5, bval[53], _x6 );
			VAL_MAC_CONJ( _yp6, bval[54], _x6 );
			VAL_MAC_CONJ( _yp7, bval[55], _x6 );
			/* Block as stored: _y<r> takes bval[8*r + c] with _xp<c>. */
			VAL_MAC_CONJ( _y0, bval[0], _xp0 );
			VAL_MAC_CONJ( _y1, bval[8], _xp0 );
			VAL_MAC_CONJ( _y2, bval[16], _xp0 );
			VAL_MAC_CONJ( _y3, bval[24], _xp0 );
			VAL_MAC_CONJ( _y4, bval[32], _xp0 );
			VAL_MAC_CONJ( _y5, bval[40], _xp0 );
			VAL_MAC_CONJ( _y6, bval[48], _xp0 );
			VAL_MAC_CONJ( _y0, bval[1], _xp1 );
			VAL_MAC_CONJ( _y1, bval[9], _xp1 );
			VAL_MAC_CONJ( _y2, bval[17], _xp1 );
			VAL_MAC_CONJ( _y3, bval[25], _xp1 );
			VAL_MAC_CONJ( _y4, bval[33], _xp1 );
			VAL_MAC_CONJ( _y5, bval[41], _xp1 );
			VAL_MAC_CONJ( _y6, bval[49], _xp1 );
			VAL_MAC_CONJ( _y0, bval[2], _xp2 );
			VAL_MAC_CONJ( _y1, bval[10], _xp2 );
			VAL_MAC_CONJ( _y2, bval[18], _xp2 );
			VAL_MAC_CONJ( _y3, bval[26], _xp2 );
			VAL_MAC_CONJ( _y4, bval[34], _xp2 );
			VAL_MAC_CONJ( _y5, bval[42], _xp2 );
			VAL_MAC_CONJ( _y6, bval[50], _xp2 );
			VAL_MAC_CONJ( _y0, bval[3], _xp3 );
			VAL_MAC_CONJ( _y1, bval[11], _xp3 );
			VAL_MAC_CONJ( _y2, bval[19], _xp3 );
			VAL_MAC_CONJ( _y3, bval[27], _xp3 );
			VAL_MAC_CONJ( _y4, bval[35], _xp3 );
			VAL_MAC_CONJ( _y5, bval[43], _xp3 );
			VAL_MAC_CONJ( _y6, bval[51], _xp3 );
			VAL_MAC_CONJ( _y0, bval[4], _xp4 );
			VAL_MAC_CONJ( _y1, bval[12], _xp4 );
			VAL_MAC_CONJ( _y2, bval[20], _xp4 );
			VAL_MAC_CONJ( _y3, bval[28], _xp4 );
			VAL_MAC_CONJ( _y4, bval[36], _xp4 );
			VAL_MAC_CONJ( _y5, bval[44], _xp4 );
			VAL_MAC_CONJ( _y6, bval[52], _xp4 );
			VAL_MAC_CONJ( _y0, bval[5], _xp5 );
			VAL_MAC_CONJ( _y1, bval[13], _xp5 );
			VAL_MAC_CONJ( _y2, bval[21], _xp5 );
			VAL_MAC_CONJ( _y3, bval[29], _xp5 );
			VAL_MAC_CONJ( _y4, bval[37], _xp5 );
			VAL_MAC_CONJ( _y5, bval[45], _xp5 );
			VAL_MAC_CONJ( _y6, bval[53], _xp5 );
			VAL_MAC_CONJ( _y0, bval[6], _xp6 );
			VAL_MAC_CONJ( _y1, bval[14], _xp6 );
			VAL_MAC_CONJ( _y2, bval[22], _xp6 );
			VAL_MAC_CONJ( _y3, bval[30], _xp6 );
			VAL_MAC_CONJ( _y4, bval[38], _xp6 );
			VAL_MAC_CONJ( _y5, bval[46], _xp6 );
			VAL_MAC_CONJ( _y6, bval[54], _xp6 );
			VAL_MAC_CONJ( _y0, bval[7], _xp7 );
			VAL_MAC_CONJ( _y1, bval[15], _xp7 );
			VAL_MAC_CONJ( _y2, bval[23], _xp7 );
			VAL_MAC_CONJ( _y3, bval[31], _xp7 );
			VAL_MAC_CONJ( _y4, bval[39], _xp7 );
			VAL_MAC_CONJ( _y5, bval[47], _xp7 );
			VAL_MAC_CONJ( _y6, bval[55], _xp7 );
			/*
			 * Scatter the transposed contribution into y at the block's
			 * columns. No alpha here: it is already folded into _x0.._x6.
			 */
			VAL_INC( ypp[0], _yp0 );
			VAL_INC( ypp[1], _yp1 );
			VAL_INC( ypp[2], _yp2 );
			VAL_INC( ypp[3], _yp3 );
			VAL_INC( ypp[4], _yp4 );
			VAL_INC( ypp[5], _yp5 );
			VAL_INC( ypp[6], _yp6 );
			VAL_INC( ypp[7], _yp7 );
		}
		/* alpha is applied here because _xp0.._xp7 were not pre-scaled. */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1], alpha, _y1 );
		VAL_MAC( yp[2], alpha, _y2 );
		VAL_MAC( yp[3], alpha, _y3 );
		VAL_MAC( yp[4], alpha, _y4 );
		VAL_MAC( yp[5], alpha, _y5 );
		VAL_MAC( yp[6], alpha, _y6 );
	}
	/* Diagonal block multiply */
	/* Rewind to block row 0; bdiag advances by one 7x7 block per row. */
	yp = y + d0;
	xp = x + d0;
	for( I = 0; I < M; I++, bdiag += 7*7, yp += 7, xp += 7 )
	{
		/* _x0.._x6: unscaled x entries of this block row. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		/* _y0.._y6: accumulators for the diagonal block product. */
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		VAL_ASSIGN( _x0, xp[0] );
		VAL_ASSIGN( _x1, xp[1] );
		VAL_ASSIGN( _x2, xp[2] );
		VAL_ASSIGN( _x3, xp[3] );
		VAL_ASSIGN( _x4, xp[4] );
		VAL_ASSIGN( _x5, xp[5] );
		VAL_ASSIGN( _x6, xp[6] );
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _y<r> takes bdiag[7*r + c] with _x<c>, one column c at a time. */
		VAL_MAC_CONJ( _y0, bdiag[0], _x0 );
		VAL_MAC_CONJ( _y1, bdiag[7], _x0 );
		VAL_MAC_CONJ( _y2, bdiag[14], _x0 );
		VAL_MAC_CONJ( _y3, bdiag[21], _x0 );
		VAL_MAC_CONJ( _y4, bdiag[28], _x0 );
		VAL_MAC_CONJ( _y5, bdiag[35], _x0 );
		VAL_MAC_CONJ( _y6, bdiag[42], _x0 );
		VAL_MAC_CONJ( _y0, bdiag[1], _x1 );
		VAL_MAC_CONJ( _y1, bdiag[8], _x1 );
		VAL_MAC_CONJ( _y2, bdiag[15], _x1 );
		VAL_MAC_CONJ( _y3, bdiag[22], _x1 );
		VAL_MAC_CONJ( _y4, bdiag[29], _x1 );
		VAL_MAC_CONJ( _y5, bdiag[36], _x1 );
		VAL_MAC_CONJ( _y6, bdiag[43], _x1 );
		VAL_MAC_CONJ( _y0, bdiag[2], _x2 );
		VAL_MAC_CONJ( _y1, bdiag[9], _x2 );
		VAL_MAC_CONJ( _y2, bdiag[16], _x2 );
		VAL_MAC_CONJ( _y3, bdiag[23], _x2 );
		VAL_MAC_CONJ( _y4, bdiag[30], _x2 );
		VAL_MAC_CONJ( _y5, bdiag[37], _x2 );
		VAL_MAC_CONJ( _y6, bdiag[44], _x2 );
		VAL_MAC_CONJ( _y0, bdiag[3], _x3 );
		VAL_MAC_CONJ( _y1, bdiag[10], _x3 );
		VAL_MAC_CONJ( _y2, bdiag[17], _x3 );
		VAL_MAC_CONJ( _y3, bdiag[24], _x3 );
		VAL_MAC_CONJ( _y4, bdiag[31], _x3 );
		VAL_MAC_CONJ( _y5, bdiag[38], _x3 );
		VAL_MAC_CONJ( _y6, bdiag[45], _x3 );
		VAL_MAC_CONJ( _y0, bdiag[4], _x4 );
		VAL_MAC_CONJ( _y1, bdiag[11], _x4 );
		VAL_MAC_CONJ( _y2, bdiag[18], _x4 );
		VAL_MAC_CONJ( _y3, bdiag[25], _x4 );
		VAL_MAC_CONJ( _y4, bdiag[32], _x4 );
		VAL_MAC_CONJ( _y5, bdiag[39], _x4 );
		VAL_MAC_CONJ( _y6, bdiag[46], _x4 );
		VAL_MAC_CONJ( _y0, bdiag[5], _x5 );
		VAL_MAC_CONJ( _y1, bdiag[12], _x5 );
		VAL_MAC_CONJ( _y2, bdiag[19], _x5 );
		VAL_MAC_CONJ( _y3, bdiag[26], _x5 );
		VAL_MAC_CONJ( _y4, bdiag[33], _x5 );
		VAL_MAC_CONJ( _y5, bdiag[40], _x5 );
		VAL_MAC_CONJ( _y6, bdiag[47], _x5 );
		VAL_MAC_CONJ( _y0, bdiag[6], _x6 );
		VAL_MAC_CONJ( _y1, bdiag[13], _x6 );
		VAL_MAC_CONJ( _y2, bdiag[20], _x6 );
		VAL_MAC_CONJ( _y3, bdiag[27], _x6 );
		VAL_MAC_CONJ( _y4, bdiag[34], _x6 );
		VAL_MAC_CONJ( _y5, bdiag[41], _x6 );
		VAL_MAC_CONJ( _y6, bdiag[48], _x6 );
		/* Fold the alpha-scaled diagonal product into y. */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1], alpha, _y1 );
		VAL_MAC( yp[2], alpha, _y2 );
		VAL_MAC( yp[3], alpha, _y3 );
		VAL_MAC( yp[4], alpha, _y4 );
		VAL_MAC( yp[5], alpha, _y5 );
		VAL_MAC( yp[6], alpha, _y6 );
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX(). */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ysX MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ysX */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ysX MANGLE_MOD_(MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ysX_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$.
 *
 *  Kernel variant with contiguous \c x and an explicit element
 *  stride \c incy for \c y. Every stored 7x8 off-diagonal block is
 *  applied twice: once as stored and once transposed. A second pass
 *  then applies the 7x7 diagonal blocks.
 *
 *  NOTE(review): the inline notes assume VAL_MAC_CONJ(a, b, c)
 *  accumulates conj(b)*c into a, VAL_MAC(a, b, c) accumulates b*c
 *  into a, and VAL_INC(a, b) adds b to a. These macros are defined
 *  outside this file; confirm.
 *
 *  \param[in] M Number of block rows to process.
 *  \param[in] d0 Logical index of the first element of block row 0
 *  within \c x and \c y (multiplied by \c incy for \c y).
 *  \param[in] bptr Block-row pointers; block row \c I owns
 *  <tt>bptr[I+1] - bptr[I]</tt> off-diagonal blocks.
 *  \param[in] bind Leftmost column index of each off-diagonal block,
 *  read sequentially.
 *  \param[in] bval Off-diagonal block values, 7*8 per block, read
 *  sequentially; entry (r, c) of a block is <tt>bval[8*r + c]</tt>.
 *  \param[in] bdiag Diagonal block values, 7*7 per block row; entry
 *  (r, c) of a block is <tt>bdiag[7*r + c]</tt>.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Input vector.
 *  \param[in,out] y Output vector, updated in place.
 *  \param[in] incy Stride between consecutive logical elements of \c y.
 */
void
MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y, oski_index_t incy )
{
	/* Position yp/xp at the first element of block row 0. */
	oski_value_t* yp = y + d0*incy;
	const oski_value_t* xp = x + d0;
	oski_index_t I;

	/* Off-diagonal pass: one iteration per block row of 7 elements. */
	for( I = 0; I < M; I++, yp += 7*incy, xp += 7 )
	{
		/* _y0.._y6: accumulators for the 7 outputs of this block row. */
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		/* _x0.._x6: this block row's entries of x, pre-scaled by alpha. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		oski_index_t K; /* block counter only; never used as an index */

		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		VAL_MUL( _x0, alpha, xp[0] );
		VAL_MUL( _x1, alpha, xp[1] );
		VAL_MUL( _x2, alpha, xp[2] );
		VAL_MUL( _x3, alpha, xp[3] );
		VAL_MUL( _x4, alpha, xp[4] );
		VAL_MUL( _x5, alpha, xp[5] );
		VAL_MUL( _x6, alpha, xp[6] );

		/*
		 * Walk the off-diagonal blocks of block row I. The bind and
		 * bval pointers advance by one block per iteration and carry
		 * over from one block row to the next.
		 */
		for( K = bptr[I]; K < bptr[I+1]; K++, bind++, bval += 7*8 )
		{
			oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* xpp = x + j0;
			oski_value_t* ypp = y + j0*incy;

			/* _xp0.._xp7: unscaled x entries at the block's 8 columns. */
			REGISTER oski_value_t _xp0;
			REGISTER oski_value_t _xp1;
			REGISTER oski_value_t _xp2;
			REGISTER oski_value_t _xp3;
			REGISTER oski_value_t _xp4;
			REGISTER oski_value_t _xp5;
			REGISTER oski_value_t _xp6;
			REGISTER oski_value_t _xp7;
			/* _yp0.._yp7: accumulators for the transposed contribution. */
			REGISTER oski_value_t _yp0;
			REGISTER oski_value_t _yp1;
			REGISTER oski_value_t _yp2;
			REGISTER oski_value_t _yp3;
			REGISTER oski_value_t _yp4;
			REGISTER oski_value_t _yp5;
			REGISTER oski_value_t _yp6;
			REGISTER oski_value_t _yp7;
			VAL_SET_ZERO( _yp0 );
			VAL_SET_ZERO( _yp1 );
			VAL_SET_ZERO( _yp2 );
			VAL_SET_ZERO( _yp3 );
			VAL_SET_ZERO( _yp4 );
			VAL_SET_ZERO( _yp5 );
			VAL_SET_ZERO( _yp6 );
			VAL_SET_ZERO( _yp7 );
			VAL_ASSIGN( _xp0, xpp[0] );
			VAL_ASSIGN( _xp1, xpp[1] );
			VAL_ASSIGN( _xp2, xpp[2] );
			VAL_ASSIGN( _xp3, xpp[3] );
			VAL_ASSIGN( _xp4, xpp[4] );
			VAL_ASSIGN( _xp5, xpp[5] );
			VAL_ASSIGN( _xp6, xpp[6] );
			VAL_ASSIGN( _xp7, xpp[7] );
			/* Transposed block: _yp<c> takes bval[8*r + c] with _x<r>. */
			VAL_MAC_CONJ( _yp0, bval[0], _x0 );
			VAL_MAC_CONJ( _yp1, bval[1], _x0 );
			VAL_MAC_CONJ( _yp2, bval[2], _x0 );
			VAL_MAC_CONJ( _yp3, bval[3], _x0 );
			VAL_MAC_CONJ( _yp4, bval[4], _x0 );
			VAL_MAC_CONJ( _yp5, bval[5], _x0 );
			VAL_MAC_CONJ( _yp6, bval[6], _x0 );
			VAL_MAC_CONJ( _yp7, bval[7], _x0 );
			VAL_MAC_CONJ( _yp0, bval[8], _x1 );
			VAL_MAC_CONJ( _yp1, bval[9], _x1 );
			VAL_MAC_CONJ( _yp2, bval[10], _x1 );
			VAL_MAC_CONJ( _yp3, bval[11], _x1 );
			VAL_MAC_CONJ( _yp4, bval[12], _x1 );
			VAL_MAC_CONJ( _yp5, bval[13], _x1 );
			VAL_MAC_CONJ( _yp6, bval[14], _x1 );
			VAL_MAC_CONJ( _yp7, bval[15], _x1 );
			VAL_MAC_CONJ( _yp0, bval[16], _x2 );
			VAL_MAC_CONJ( _yp1, bval[17], _x2 );
			VAL_MAC_CONJ( _yp2, bval[18], _x2 );
			VAL_MAC_CONJ( _yp3, bval[19], _x2 );
			VAL_MAC_CONJ( _yp4, bval[20], _x2 );
			VAL_MAC_CONJ( _yp5, bval[21], _x2 );
			VAL_MAC_CONJ( _yp6, bval[22], _x2 );
			VAL_MAC_CONJ( _yp7, bval[23], _x2 );
			VAL_MAC_CONJ( _yp0, bval[24], _x3 );
			VAL_MAC_CONJ( _yp1, bval[25], _x3 );
			VAL_MAC_CONJ( _yp2, bval[26], _x3 );
			VAL_MAC_CONJ( _yp3, bval[27], _x3 );
			VAL_MAC_CONJ( _yp4, bval[28], _x3 );
			VAL_MAC_CONJ( _yp5, bval[29], _x3 );
			VAL_MAC_CONJ( _yp6, bval[30], _x3 );
			VAL_MAC_CONJ( _yp7, bval[31], _x3 );
			VAL_MAC_CONJ( _yp0, bval[32], _x4 );
			VAL_MAC_CONJ( _yp1, bval[33], _x4 );
			VAL_MAC_CONJ( _yp2, bval[34], _x4 );
			VAL_MAC_CONJ( _yp3, bval[35], _x4 );
			VAL_MAC_CONJ( _yp4, bval[36], _x4 );
			VAL_MAC_CONJ( _yp5, bval[37], _x4 );
			VAL_MAC_CONJ( _yp6, bval[38], _x4 );
			VAL_MAC_CONJ( _yp7, bval[39], _x4 );
			VAL_MAC_CONJ( _yp0, bval[40], _x5 );
			VAL_MAC_CONJ( _yp1, bval[41], _x5 );
			VAL_MAC_CONJ( _yp2, bval[42], _x5 );
			VAL_MAC_CONJ( _yp3, bval[43], _x5 );
			VAL_MAC_CONJ( _yp4, bval[44], _x5 );
			VAL_MAC_CONJ( _yp5, bval[45], _x5 );
			VAL_MAC_CONJ( _yp6, bval[46], _x5 );
			VAL_MAC_CONJ( _yp7, bval[47], _x5 );
			VAL_MAC_CONJ( _yp0, bval[48], _x6 );
			VAL_MAC_CONJ( _yp1, bval[49], _x6 );
			VAL_MAC_CONJ( _yp2, bval[50], _x6 );
			VAL_MAC_CONJ( _yp3, bval[51], _x6 );
			VAL_MAC_CONJ( _yp4, bval[52], _x6 );
			VAL_MAC_CONJ( _yp5, bval[53], _x6 );
			VAL_MAC_CONJ( _yp6, bval[54], _x6 );
			VAL_MAC_CONJ( _yp7, bval[55], _x6 );
			/* Block as stored: _y<r> takes bval[8*r + c] with _xp<c>. */
			VAL_MAC_CONJ( _y0, bval[0], _xp0 );
			VAL_MAC_CONJ( _y1, bval[8], _xp0 );
			VAL_MAC_CONJ( _y2, bval[16], _xp0 );
			VAL_MAC_CONJ( _y3, bval[24], _xp0 );
			VAL_MAC_CONJ( _y4, bval[32], _xp0 );
			VAL_MAC_CONJ( _y5, bval[40], _xp0 );
			VAL_MAC_CONJ( _y6, bval[48], _xp0 );
			VAL_MAC_CONJ( _y0, bval[1], _xp1 );
			VAL_MAC_CONJ( _y1, bval[9], _xp1 );
			VAL_MAC_CONJ( _y2, bval[17], _xp1 );
			VAL_MAC_CONJ( _y3, bval[25], _xp1 );
			VAL_MAC_CONJ( _y4, bval[33], _xp1 );
			VAL_MAC_CONJ( _y5, bval[41], _xp1 );
			VAL_MAC_CONJ( _y6, bval[49], _xp1 );
			VAL_MAC_CONJ( _y0, bval[2], _xp2 );
			VAL_MAC_CONJ( _y1, bval[10], _xp2 );
			VAL_MAC_CONJ( _y2, bval[18], _xp2 );
			VAL_MAC_CONJ( _y3, bval[26], _xp2 );
			VAL_MAC_CONJ( _y4, bval[34], _xp2 );
			VAL_MAC_CONJ( _y5, bval[42], _xp2 );
			VAL_MAC_CONJ( _y6, bval[50], _xp2 );
			VAL_MAC_CONJ( _y0, bval[3], _xp3 );
			VAL_MAC_CONJ( _y1, bval[11], _xp3 );
			VAL_MAC_CONJ( _y2, bval[19], _xp3 );
			VAL_MAC_CONJ( _y3, bval[27], _xp3 );
			VAL_MAC_CONJ( _y4, bval[35], _xp3 );
			VAL_MAC_CONJ( _y5, bval[43], _xp3 );
			VAL_MAC_CONJ( _y6, bval[51], _xp3 );
			VAL_MAC_CONJ( _y0, bval[4], _xp4 );
			VAL_MAC_CONJ( _y1, bval[12], _xp4 );
			VAL_MAC_CONJ( _y2, bval[20], _xp4 );
			VAL_MAC_CONJ( _y3, bval[28], _xp4 );
			VAL_MAC_CONJ( _y4, bval[36], _xp4 );
			VAL_MAC_CONJ( _y5, bval[44], _xp4 );
			VAL_MAC_CONJ( _y6, bval[52], _xp4 );
			VAL_MAC_CONJ( _y0, bval[5], _xp5 );
			VAL_MAC_CONJ( _y1, bval[13], _xp5 );
			VAL_MAC_CONJ( _y2, bval[21], _xp5 );
			VAL_MAC_CONJ( _y3, bval[29], _xp5 );
			VAL_MAC_CONJ( _y4, bval[37], _xp5 );
			VAL_MAC_CONJ( _y5, bval[45], _xp5 );
			VAL_MAC_CONJ( _y6, bval[53], _xp5 );
			VAL_MAC_CONJ( _y0, bval[6], _xp6 );
			VAL_MAC_CONJ( _y1, bval[14], _xp6 );
			VAL_MAC_CONJ( _y2, bval[22], _xp6 );
			VAL_MAC_CONJ( _y3, bval[30], _xp6 );
			VAL_MAC_CONJ( _y4, bval[38], _xp6 );
			VAL_MAC_CONJ( _y5, bval[46], _xp6 );
			VAL_MAC_CONJ( _y6, bval[54], _xp6 );
			VAL_MAC_CONJ( _y0, bval[7], _xp7 );
			VAL_MAC_CONJ( _y1, bval[15], _xp7 );
			VAL_MAC_CONJ( _y2, bval[23], _xp7 );
			VAL_MAC_CONJ( _y3, bval[31], _xp7 );
			VAL_MAC_CONJ( _y4, bval[39], _xp7 );
			VAL_MAC_CONJ( _y5, bval[47], _xp7 );
			VAL_MAC_CONJ( _y6, bval[55], _xp7 );
			/*
			 * Scatter the transposed contribution into y at the block's
			 * columns. No alpha here: it is already folded into _x0.._x6.
			 */
			VAL_INC( ypp[0], _yp0 );
			VAL_INC( ypp[1*incy], _yp1 );
			VAL_INC( ypp[2*incy], _yp2 );
			VAL_INC( ypp[3*incy], _yp3 );
			VAL_INC( ypp[4*incy], _yp4 );
			VAL_INC( ypp[5*incy], _yp5 );
			VAL_INC( ypp[6*incy], _yp6 );
			VAL_INC( ypp[7*incy], _yp7 );
		}
		/* alpha is applied here because _xp0.._xp7 were not pre-scaled. */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1*incy], alpha, _y1 );
		VAL_MAC( yp[2*incy], alpha, _y2 );
		VAL_MAC( yp[3*incy], alpha, _y3 );
		VAL_MAC( yp[4*incy], alpha, _y4 );
		VAL_MAC( yp[5*incy], alpha, _y5 );
		VAL_MAC( yp[6*incy], alpha, _y6 );
	}
	/* Diagonal block multiply */
	/* Rewind to block row 0; bdiag advances by one 7x7 block per row. */
	yp = y + d0*incy;
	xp = x + d0;
	for( I = 0; I < M; I++, bdiag += 7*7, yp += 7*incy, xp += 7 )
	{
		/* _x0.._x6: unscaled x entries of this block row. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		/* _y0.._y6: accumulators for the diagonal block product. */
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		VAL_ASSIGN( _x0, xp[0] );
		VAL_ASSIGN( _x1, xp[1] );
		VAL_ASSIGN( _x2, xp[2] );
		VAL_ASSIGN( _x3, xp[3] );
		VAL_ASSIGN( _x4, xp[4] );
		VAL_ASSIGN( _x5, xp[5] );
		VAL_ASSIGN( _x6, xp[6] );
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _y<r> takes bdiag[7*r + c] with _x<c>, one column c at a time. */
		VAL_MAC_CONJ( _y0, bdiag[0], _x0 );
		VAL_MAC_CONJ( _y1, bdiag[7], _x0 );
		VAL_MAC_CONJ( _y2, bdiag[14], _x0 );
		VAL_MAC_CONJ( _y3, bdiag[21], _x0 );
		VAL_MAC_CONJ( _y4, bdiag[28], _x0 );
		VAL_MAC_CONJ( _y5, bdiag[35], _x0 );
		VAL_MAC_CONJ( _y6, bdiag[42], _x0 );
		VAL_MAC_CONJ( _y0, bdiag[1], _x1 );
		VAL_MAC_CONJ( _y1, bdiag[8], _x1 );
		VAL_MAC_CONJ( _y2, bdiag[15], _x1 );
		VAL_MAC_CONJ( _y3, bdiag[22], _x1 );
		VAL_MAC_CONJ( _y4, bdiag[29], _x1 );
		VAL_MAC_CONJ( _y5, bdiag[36], _x1 );
		VAL_MAC_CONJ( _y6, bdiag[43], _x1 );
		VAL_MAC_CONJ( _y0, bdiag[2], _x2 );
		VAL_MAC_CONJ( _y1, bdiag[9], _x2 );
		VAL_MAC_CONJ( _y2, bdiag[16], _x2 );
		VAL_MAC_CONJ( _y3, bdiag[23], _x2 );
		VAL_MAC_CONJ( _y4, bdiag[30], _x2 );
		VAL_MAC_CONJ( _y5, bdiag[37], _x2 );
		VAL_MAC_CONJ( _y6, bdiag[44], _x2 );
		VAL_MAC_CONJ( _y0, bdiag[3], _x3 );
		VAL_MAC_CONJ( _y1, bdiag[10], _x3 );
		VAL_MAC_CONJ( _y2, bdiag[17], _x3 );
		VAL_MAC_CONJ( _y3, bdiag[24], _x3 );
		VAL_MAC_CONJ( _y4, bdiag[31], _x3 );
		VAL_MAC_CONJ( _y5, bdiag[38], _x3 );
		VAL_MAC_CONJ( _y6, bdiag[45], _x3 );
		VAL_MAC_CONJ( _y0, bdiag[4], _x4 );
		VAL_MAC_CONJ( _y1, bdiag[11], _x4 );
		VAL_MAC_CONJ( _y2, bdiag[18], _x4 );
		VAL_MAC_CONJ( _y3, bdiag[25], _x4 );
		VAL_MAC_CONJ( _y4, bdiag[32], _x4 );
		VAL_MAC_CONJ( _y5, bdiag[39], _x4 );
		VAL_MAC_CONJ( _y6, bdiag[46], _x4 );
		VAL_MAC_CONJ( _y0, bdiag[5], _x5 );
		VAL_MAC_CONJ( _y1, bdiag[12], _x5 );
		VAL_MAC_CONJ( _y2, bdiag[19], _x5 );
		VAL_MAC_CONJ( _y3, bdiag[26], _x5 );
		VAL_MAC_CONJ( _y4, bdiag[33], _x5 );
		VAL_MAC_CONJ( _y5, bdiag[40], _x5 );
		VAL_MAC_CONJ( _y6, bdiag[47], _x5 );
		VAL_MAC_CONJ( _y0, bdiag[6], _x6 );
		VAL_MAC_CONJ( _y1, bdiag[13], _x6 );
		VAL_MAC_CONJ( _y2, bdiag[20], _x6 );
		VAL_MAC_CONJ( _y3, bdiag[27], _x6 );
		VAL_MAC_CONJ( _y4, bdiag[34], _x6 );
		VAL_MAC_CONJ( _y5, bdiag[41], _x6 );
		VAL_MAC_CONJ( _y6, bdiag[48], _x6 );
		/* Fold the alpha-scaled diagonal product into y. */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1*incy], alpha, _y1 );
		VAL_MAC( yp[2*incy], alpha, _y2 );
		VAL_MAC( yp[3*incy], alpha, _y3 );
		VAL_MAC( yp[4*incy], alpha, _y4 );
		VAL_MAC( yp[5*incy], alpha, _y5 );
		VAL_MAC( yp[6*incy], alpha, _y6 );
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1(). */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ys1 MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ys1 */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ys1 MANGLE_MOD_(MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ys1_7x8)
#endif

/**
 *  \brief MBCSR 7x8 kernel for
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$ on symmetric
 *  storage, with a general stride on \f$x\f$ and unit stride on
 *  \f$y\f$.
 *
 *  Each stored off-diagonal block is used twice: once to update
 *  the 7 entries of \f$y\f$ belonging to its block row, and once,
 *  transposed, to update the 8 entries of \f$y\f$ starting at the
 *  block's column index. The 7x7 diagonal blocks are applied in a
 *  separate second sweep.
 *
 *  \param[in] M      Number of block rows to process.
 *  \param[in] d0     Offset (in elements) of the first block row
 *                    within \f$x\f$ and \f$y\f$.
 *  \param[in] bptr   Block-row pointers; block row I owns the
 *                    off-diagonal blocks bptr[I] .. bptr[I+1]-1.
 *  \param[in] bind   One starting column index per off-diagonal
 *                    block, read sequentially.
 *  \param[in] bval   Off-diagonal block values, 7*8 per block,
 *                    entry (r,c) of a block at offset r*8 + c.
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row,
 *                    entry (r,c) of a block at offset r*7 + c.
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x      Input vector.
 *  \param[in] incx   Distance between consecutive elements of x.
 *  \param[in,out] y  Output vector (unit stride), updated in place.
 */
void
MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y )
{
	oski_value_t* y_row;
	const oski_value_t* x_row;
	oski_index_t brow;
	oski_index_t row;
	oski_index_t col;

	/* Sweep 1: off-diagonal blocks, applied directly and transposed. */
	y_row = y + d0;
	x_row = x + d0*incx;
	for( brow = 0; brow < M; brow++ )
	{
		oski_value_t row_sum[7]; /* unscaled result for this block row */
		oski_value_t ax[7]; /* alpha * x over this block row */
		oski_index_t blk;

		for( row = 0; row < 7; row++ )
		{
			VAL_SET_ZERO( row_sum[row] );
		}
		for( row = 0; row < 7; row++ )
		{
			VAL_MUL( ax[row], alpha, x_row[row*incx] );
		}

		for( blk = bptr[brow]; blk < bptr[brow+1]; blk++ )
		{
			oski_index_t j0 = bind[0]; /* index at which the block's columns start */
			const oski_value_t* x_col = x + j0*incx;
			oski_value_t* y_col = y + j0;
			oski_value_t col_x[8]; /* x over the block's columns */
			oski_value_t col_sum[8]; /* result of the transposed block */

			for( col = 0; col < 8; col++ )
			{
				VAL_SET_ZERO( col_sum[col] );
			}
			for( col = 0; col < 8; col++ )
			{
				VAL_ASSIGN( col_x[col], x_col[col*incx] );
			}

			/* Transposed use: block row r feeds every column sum. */
			for( row = 0; row < 7; row++ )
			{
				for( col = 0; col < 8; col++ )
				{
					VAL_MAC_CONJ( col_sum[col], bval[row*8 + col], ax[row] );
				}
			}

			/*
			 * Direct use: walk column by column so that each row
			 * sum receives its terms in increasing column order.
			 */
			for( col = 0; col < 8; col++ )
			{
				for( row = 0; row < 7; row++ )
				{
					VAL_MAC_CONJ( row_sum[row], bval[row*8 + col], col_x[col] );
				}
			}

			/* alpha is already folded into ax, so add as-is. */
			for( col = 0; col < 8; col++ )
			{
				VAL_INC( y_col[col], col_sum[col] );
			}

			/* Step to the next stored block. */
			bind++;
			bval += 7*8;
		}

		for( row = 0; row < 7; row++ )
		{
			VAL_MAC( y_row[row], alpha, row_sum[row] );
		}

		y_row += 7;
		x_row += 7*incx;
	}

	/* Sweep 2: diagonal blocks. */
	y_row = y + d0;
	x_row = x + d0*incx;
	for( brow = 0; brow < M; brow++ )
	{
		oski_value_t xd[7]; /* x over this block row */
		oski_value_t yd[7]; /* unscaled diagonal-block result */

		for( row = 0; row < 7; row++ )
		{
			VAL_ASSIGN( xd[row], x_row[row*incx] );
		}
		for( row = 0; row < 7; row++ )
		{
			VAL_SET_ZERO( yd[row] );
		}

		for( col = 0; col < 7; col++ )
		{
			for( row = 0; row < 7; row++ )
			{
				VAL_MAC_CONJ( yd[row], bdiag[row*7 + col], xd[col] );
			}
		}

		for( row = 0; row < 7; row++ )
		{
			VAL_MAC( y_row[row], alpha, yd[row] );
		}

		bdiag += 7*7;
		y_row += 7;
		x_row += 7*incx;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX(). */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ysX MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ysX */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ysX MANGLE_MOD_(MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ysX_7x8)
#endif

/**
 *  \brief MBCSR 7x8 kernel for
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$ on symmetric
 *  storage, with general strides on both \f$x\f$ and \f$y\f$.
 *
 *  Each stored off-diagonal block is used twice: once to update
 *  the 7 entries of \f$y\f$ belonging to its block row, and once,
 *  transposed, to update the 8 entries of \f$y\f$ starting at the
 *  block's column index. The 7x7 diagonal blocks are applied in a
 *  separate second sweep.
 *
 *  \param[in] M      Number of block rows to process.
 *  \param[in] d0     Offset (in logical elements) of the first
 *                    block row within \f$x\f$ and \f$y\f$.
 *  \param[in] bptr   Block-row pointers; block row I owns the
 *                    off-diagonal blocks bptr[I] .. bptr[I+1]-1.
 *  \param[in] bind   One starting column index per off-diagonal
 *                    block, read sequentially.
 *  \param[in] bval   Off-diagonal block values, 7*8 per block,
 *                    entry (r,c) of a block at offset r*8 + c.
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row,
 *                    entry (r,c) of a block at offset r*7 + c.
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x      Input vector.
 *  \param[in] incx   Distance between consecutive elements of x.
 *  \param[in,out] y  Output vector, updated in place.
 *  \param[in] incy   Distance between consecutive elements of y.
 */
void
MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y, oski_index_t incy )
{
	oski_value_t* y_row;
	const oski_value_t* x_row;
	oski_index_t brow;
	oski_index_t row;
	oski_index_t col;

	/* Sweep 1: off-diagonal blocks, applied directly and transposed. */
	y_row = y + d0*incy;
	x_row = x + d0*incx;
	for( brow = 0; brow < M; brow++ )
	{
		oski_value_t row_sum[7]; /* unscaled result for this block row */
		oski_value_t ax[7]; /* alpha * x over this block row */
		oski_index_t blk;

		for( row = 0; row < 7; row++ )
		{
			VAL_SET_ZERO( row_sum[row] );
		}
		for( row = 0; row < 7; row++ )
		{
			VAL_MUL( ax[row], alpha, x_row[row*incx] );
		}

		for( blk = bptr[brow]; blk < bptr[brow+1]; blk++ )
		{
			oski_index_t j0 = bind[0]; /* index at which the block's columns start */
			const oski_value_t* x_col = x + j0*incx;
			oski_value_t* y_col = y + j0*incy;
			oski_value_t col_x[8]; /* x over the block's columns */
			oski_value_t col_sum[8]; /* result of the transposed block */

			for( col = 0; col < 8; col++ )
			{
				VAL_SET_ZERO( col_sum[col] );
			}
			for( col = 0; col < 8; col++ )
			{
				VAL_ASSIGN( col_x[col], x_col[col*incx] );
			}

			/* Transposed use: block row r feeds every column sum. */
			for( row = 0; row < 7; row++ )
			{
				for( col = 0; col < 8; col++ )
				{
					VAL_MAC_CONJ( col_sum[col], bval[row*8 + col], ax[row] );
				}
			}

			/*
			 * Direct use: walk column by column so that each row
			 * sum receives its terms in increasing column order.
			 */
			for( col = 0; col < 8; col++ )
			{
				for( row = 0; row < 7; row++ )
				{
					VAL_MAC_CONJ( row_sum[row], bval[row*8 + col], col_x[col] );
				}
			}

			/* alpha is already folded into ax, so add as-is. */
			for( col = 0; col < 8; col++ )
			{
				VAL_INC( y_col[col*incy], col_sum[col] );
			}

			/* Step to the next stored block. */
			bind++;
			bval += 7*8;
		}

		for( row = 0; row < 7; row++ )
		{
			VAL_MAC( y_row[row*incy], alpha, row_sum[row] );
		}

		y_row += 7*incy;
		x_row += 7*incx;
	}

	/* Sweep 2: diagonal blocks. */
	y_row = y + d0*incy;
	x_row = x + d0*incx;
	for( brow = 0; brow < M; brow++ )
	{
		oski_value_t xd[7]; /* x over this block row */
		oski_value_t yd[7]; /* unscaled diagonal-block result */

		for( row = 0; row < 7; row++ )
		{
			VAL_ASSIGN( xd[row], x_row[row*incx] );
		}
		for( row = 0; row < 7; row++ )
		{
			VAL_SET_ZERO( yd[row] );
		}

		for( col = 0; col < 7; col++ )
		{
			for( row = 0; row < 7; row++ )
			{
				VAL_MAC_CONJ( yd[row], bdiag[row*7 + col], xd[col] );
			}
		}

		for( row = 0; row < 7; row++ )
		{
			VAL_MAC( y_row[row*incy], alpha, yd[row] );
		}

		bdiag += 7*7;
		y_row += 7*incy;
		x_row += 7*incx;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX
/**
 *  \brief Matrix times single-vector multiply in the conj case;
 *  see SymmMatMult_v1().
 */
#define SymmMatConjMult_v1 SymmMatMult_v1

#else /* IS_VAL_COMPLEX */


/**
 *  \brief Single-vector driver for the conjugated symmetric
 *  multiply: selects the kernel specialised for the given
 *  strides and forwards all arguments to it.
 *
 *  A stride equal to 1 selects the unit-stride ("s1") variant for
 *  that vector; any other value selects the general-stride ("sX")
 *  variant, which receives the stride as an extra argument.
 */
static void
SymmMatConjMult_v1( oski_index_t M, oski_index_t d0,
	const oski_index_t* bptr, const oski_index_t* bind,
	const oski_value_t* bval, const oski_value_t* bdiag,
	oski_value_t alpha, const oski_value_t* x, oski_index_t incx,
	oski_value_t* y, oski_index_t incy )
{
	const int unit_x = ( incx == 1 );
	const int unit_y = ( incy == 1 );

	if( unit_x && unit_y )
	{
		MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y );
		return;
	}

	if( unit_x )
	{
		/* Only y is strided. */
		MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ysX( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y, incy );
		return;
	}

	if( unit_y )
	{
		/* Only x is strided. */
		MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, incx, y );
		return;
	}

	/* Both vectors are strided. */
	MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ysX( M, d0,
		bptr, bind, bval, bdiag, alpha, x, incx, y, incy );
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX
/**
 *  \brief See SymmMatMult().
 */
#define SymmMatConjMult SymmMatMult

#else /* IS_VAL_COMPLEX */



/**
 *  \brief Computes
 *  \f$y \leftarrow y + \alpha\cdot\mathrm{op}(A)\cdot x\f$ with
 *  \f$\mathrm{op}(A) = \bar{A}\f$ on the fully blocked portion of
 *  \f$A\f$, one column of the vector views at a time.
 *
 *  The number of columns processed is taken from \c x_view; the
 *  column pointers of both views advance by their respective
 *  \c colinc after each column.
 *
 *  \param[in] A          7x8 MBCSR submatrix (block size is asserted).
 *  \param[in] alpha      Scalar multiplier.
 *  \param[in] x_view     Input (multi)vector view.
 *  \param[in,out] y_view Output (multi)vector view, updated in place.
 *  \returns 0 always.
 */
static int
SymmMatConjMult( const oski_submatMBCSR_t* A,
	oski_value_t alpha, const oski_vecview_t x_view,
	oski_vecview_t y_view )
{
	const oski_value_t* x_col; /* current column of x_view */
	oski_value_t* y_col; /* matching column of y_view */
	oski_index_t col_id;

	assert( A->r == 7 );
	assert( A->c == 8 );

	x_col = x_view->val;
	y_col = y_view->val;
	col_id = 0;
	while( col_id < x_view->num_cols )
	{
		SymmMatConjMult_v1( A->num_block_rows, A->offset,
			A->bptr, A->bind, A->bval, A->bdiag,
			alpha, x_col, x_view->rowinc, y_col, y_view->rowinc );

		x_col += x_view->colinc;
		y_col += y_view->colinc;
		col_id++;
	}

	return 0;
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1(). */
#define MBCSR_HermMatMult_v1_aX_b1_xs1_ys1 MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatMult_v1_aX_b1_xs1_ys1 */
#define MBCSR_HermMatMult_v1_aX_b1_xs1_ys1 MANGLE_MOD_(MBCSR_HermMatMult_v1_aX_b1_xs1_ys1_7x8)
#endif

/**
 *  \brief MBCSR 7x8 kernel for
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$ on Hermitian
 *  storage, with unit strides on both \f$x\f$ and \f$y\f$.
 *
 *  Each stored off-diagonal block is used twice: directly
 *  (via VAL_MAC) to update the 7 entries of \f$y\f$ belonging to
 *  its block row, and transposed with VAL_MAC_CONJ to update the
 *  8 entries of \f$y\f$ starting at the block's column index.
 *  The 7x7 diagonal blocks are applied directly in a separate
 *  second sweep.
 *
 *  \param[in] M      Number of block rows to process.
 *  \param[in] d0     Offset (in elements) of the first block row
 *                    within \f$x\f$ and \f$y\f$.
 *  \param[in] bptr   Block-row pointers; block row I owns the
 *                    off-diagonal blocks bptr[I] .. bptr[I+1]-1.
 *  \param[in] bind   One starting column index per off-diagonal
 *                    block, read sequentially.
 *  \param[in] bval   Off-diagonal block values, 7*8 per block,
 *                    entry (r,c) of a block at offset r*8 + c.
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row,
 *                    entry (r,c) of a block at offset r*7 + c.
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x      Input vector (unit stride).
 *  \param[in,out] y  Output vector (unit stride), updated in place.
 */
void
MBCSR_HermMatMult_v1_aX_b1_xs1_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y )
{
	oski_value_t* y_row;
	const oski_value_t* x_row;
	oski_index_t brow;
	oski_index_t row;
	oski_index_t col;

	/* Sweep 1: off-diagonal blocks, applied directly and transposed. */
	y_row = y + d0;
	x_row = x + d0;
	for( brow = 0; brow < M; brow++ )
	{
		oski_value_t row_sum[7]; /* unscaled result for this block row */
		oski_value_t ax[7]; /* alpha * x over this block row */
		oski_index_t blk;

		for( row = 0; row < 7; row++ )
		{
			VAL_SET_ZERO( row_sum[row] );
		}
		for( row = 0; row < 7; row++ )
		{
			VAL_MUL( ax[row], alpha, x_row[row] );
		}

		for( blk = bptr[brow]; blk < bptr[brow+1]; blk++ )
		{
			oski_index_t j0 = bind[0]; /* index at which the block's columns start */
			const oski_value_t* x_col = x + j0;
			oski_value_t* y_col = y + j0;
			oski_value_t col_x[8]; /* x over the block's columns */
			oski_value_t col_sum[8]; /* result of the transposed block */

			for( col = 0; col < 8; col++ )
			{
				VAL_SET_ZERO( col_sum[col] );
			}
			for( col = 0; col < 8; col++ )
			{
				VAL_ASSIGN( col_x[col], x_col[col] );
			}

			/* Transposed use goes through the conjugating macro. */
			for( row = 0; row < 7; row++ )
			{
				for( col = 0; col < 8; col++ )
				{
					VAL_MAC_CONJ( col_sum[col], bval[row*8 + col], ax[row] );
				}
			}

			/*
			 * Direct use, no conjugation: walk column by column so
			 * that each row sum receives its terms in increasing
			 * column order.
			 */
			for( col = 0; col < 8; col++ )
			{
				for( row = 0; row < 7; row++ )
				{
					VAL_MAC( row_sum[row], bval[row*8 + col], col_x[col] );
				}
			}

			/* alpha is already folded into ax, so add as-is. */
			for( col = 0; col < 8; col++ )
			{
				VAL_INC( y_col[col], col_sum[col] );
			}

			/* Step to the next stored block. */
			bind++;
			bval += 7*8;
		}

		for( row = 0; row < 7; row++ )
		{
			VAL_MAC( y_row[row], alpha, row_sum[row] );
		}

		y_row += 7;
		x_row += 7;
	}

	/* Sweep 2: diagonal blocks, applied without conjugation. */
	y_row = y + d0;
	x_row = x + d0;
	for( brow = 0; brow < M; brow++ )
	{
		oski_value_t xd[7]; /* x over this block row */
		oski_value_t yd[7]; /* unscaled diagonal-block result */

		for( row = 0; row < 7; row++ )
		{
			VAL_ASSIGN( xd[row], x_row[row] );
		}
		for( row = 0; row < 7; row++ )
		{
			VAL_SET_ZERO( yd[row] );
		}

		for( col = 0; col < 7; col++ )
		{
			for( row = 0; row < 7; row++ )
			{
				VAL_MAC( yd[row], bdiag[row*7 + col], xd[col] );
			}
		}

		for( row = 0; row < 7; row++ )
		{
			VAL_MAC( y_row[row], alpha, yd[row] );
		}

		bdiag += 7*7;
		y_row += 7;
		x_row += 7;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX(). */
#define MBCSR_HermMatMult_v1_aX_b1_xs1_ysX MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatMult_v1_aX_b1_xs1_ysX */
#define MBCSR_HermMatMult_v1_aX_b1_xs1_ysX MANGLE_MOD_(MBCSR_HermMatMult_v1_aX_b1_xs1_ysX_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$ (Hermitian variant),
 *  for unit-stride \f$x\f$ and general-stride \f$y\f$.
 *
 *  The routine makes two passes over the \c M block rows:
 *   -# Off-diagonal pass: every stored 7x8 block is used twice, once
 *      through VAL_MAC to update its own block row of \c y and once
 *      through VAL_MAC_CONJ to update the 8 elements of \c y that start
 *      at the block's leftmost column index.
 *   -# Diagonal pass: every 7x7 block of \c bdiag is applied to the
 *      matching 7 elements of \c x and accumulated into \c y.
 *
 *  \param[in] M  Number of block rows processed.
 *  \param[in] d0  Element offset of the first block row; \c x is read
 *  starting at <tt>x + d0</tt> and \c y at <tt>y + d0*incy</tt>.
 *  \param[in] bptr  Block-row pointers; block row \c I owns
 *  <tt>bptr[I+1] - bptr[I]</tt> off-diagonal blocks. The values are only
 *  used as loop bounds, never as indices into \c bind or \c bval.
 *  \param[in] bind  Leftmost column index of each off-diagonal block,
 *  consumed sequentially (one entry per block).
 *  \param[in] bval  Off-diagonal block values, 7*8 per block, consumed
 *  sequentially; <tt>bval[8*i + j]</tt> is used as entry (i, j).
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row;
 *  <tt>bdiag[7*i + j]</tt> is used as entry (i, j).
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x  Input vector, accessed with unit stride.
 *  \param[in,out] y  Output vector, updated in place.
 *  \param[in] incy  Distance between consecutive elements of \c y.
 *
 *  \note NOTE(review): the VAL_* macros are defined outside this file.
 *  The comments below assume VAL_MAC(a, b, c) performs a += b*c and
 *  VAL_MAC_CONJ(a, b, c) performs a += conj(b)*c -- confirm against the
 *  macro definitions.
 */
void
MBCSR_HermMatMult_v1_aX_b1_xs1_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y, oski_index_t incy )
{
	oski_value_t* yp = y + d0*incy;
	const oski_value_t* xp = x + d0;
	oski_index_t I;

	/* Off-diagonal block multiply: one iteration per block row */
	for( I = 0; I < M; I++, yp += 7*incy, xp += 7 )
	{
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		oski_index_t K;

		/* _y* accumulate this block row's (unscaled) result */
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _x* hold this block row's x entries combined with alpha, so the
		 * conjugated updates below need no further scaling */
		VAL_MUL( _x0, alpha, xp[0] );
		VAL_MUL( _x1, alpha, xp[1] );
		VAL_MUL( _x2, alpha, xp[2] );
		VAL_MUL( _x3, alpha, xp[3] );
		VAL_MUL( _x4, alpha, xp[4] );
		VAL_MUL( _x5, alpha, xp[5] );
		VAL_MUL( _x6, alpha, xp[6] );

		/* bind and bval advance by one block per iteration and are never
		 * rewound, so blocks are consumed in storage order across rows */
		for( K = bptr[I]; K < bptr[I+1]; K++, bind++, bval += 7*8 )
		{
			oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* xpp = x + j0;
			oski_value_t* ypp = y + j0*incy;

			REGISTER oski_value_t _xp0;
			REGISTER oski_value_t _xp1;
			REGISTER oski_value_t _xp2;
			REGISTER oski_value_t _xp3;
			REGISTER oski_value_t _xp4;
			REGISTER oski_value_t _xp5;
			REGISTER oski_value_t _xp6;
			REGISTER oski_value_t _xp7;
			REGISTER oski_value_t _yp0;
			REGISTER oski_value_t _yp1;
			REGISTER oski_value_t _yp2;
			REGISTER oski_value_t _yp3;
			REGISTER oski_value_t _yp4;
			REGISTER oski_value_t _yp5;
			REGISTER oski_value_t _yp6;
			REGISTER oski_value_t _yp7;
			VAL_SET_ZERO( _yp0 );
			VAL_SET_ZERO( _yp1 );
			VAL_SET_ZERO( _yp2 );
			VAL_SET_ZERO( _yp3 );
			VAL_SET_ZERO( _yp4 );
			VAL_SET_ZERO( _yp5 );
			VAL_SET_ZERO( _yp6 );
			VAL_SET_ZERO( _yp7 );
			/* Load the 8 x entries facing this block's columns */
			VAL_ASSIGN( _xp0, xpp[0] );
			VAL_ASSIGN( _xp1, xpp[1] );
			VAL_ASSIGN( _xp2, xpp[2] );
			VAL_ASSIGN( _xp3, xpp[3] );
			VAL_ASSIGN( _xp4, xpp[4] );
			VAL_ASSIGN( _xp5, xpp[5] );
			VAL_ASSIGN( _xp6, xpp[6] );
			VAL_ASSIGN( _xp7, xpp[7] );
			/* Conjugated use of the block: _yp(j) gets bval[8*i + j]
			 * paired with the alpha-scaled _x(i) */
			VAL_MAC_CONJ( _yp0, bval[0], _x0 );
			VAL_MAC_CONJ( _yp1, bval[1], _x0 );
			VAL_MAC_CONJ( _yp2, bval[2], _x0 );
			VAL_MAC_CONJ( _yp3, bval[3], _x0 );
			VAL_MAC_CONJ( _yp4, bval[4], _x0 );
			VAL_MAC_CONJ( _yp5, bval[5], _x0 );
			VAL_MAC_CONJ( _yp6, bval[6], _x0 );
			VAL_MAC_CONJ( _yp7, bval[7], _x0 );
			VAL_MAC_CONJ( _yp0, bval[8], _x1 );
			VAL_MAC_CONJ( _yp1, bval[9], _x1 );
			VAL_MAC_CONJ( _yp2, bval[10], _x1 );
			VAL_MAC_CONJ( _yp3, bval[11], _x1 );
			VAL_MAC_CONJ( _yp4, bval[12], _x1 );
			VAL_MAC_CONJ( _yp5, bval[13], _x1 );
			VAL_MAC_CONJ( _yp6, bval[14], _x1 );
			VAL_MAC_CONJ( _yp7, bval[15], _x1 );
			VAL_MAC_CONJ( _yp0, bval[16], _x2 );
			VAL_MAC_CONJ( _yp1, bval[17], _x2 );
			VAL_MAC_CONJ( _yp2, bval[18], _x2 );
			VAL_MAC_CONJ( _yp3, bval[19], _x2 );
			VAL_MAC_CONJ( _yp4, bval[20], _x2 );
			VAL_MAC_CONJ( _yp5, bval[21], _x2 );
			VAL_MAC_CONJ( _yp6, bval[22], _x2 );
			VAL_MAC_CONJ( _yp7, bval[23], _x2 );
			VAL_MAC_CONJ( _yp0, bval[24], _x3 );
			VAL_MAC_CONJ( _yp1, bval[25], _x3 );
			VAL_MAC_CONJ( _yp2, bval[26], _x3 );
			VAL_MAC_CONJ( _yp3, bval[27], _x3 );
			VAL_MAC_CONJ( _yp4, bval[28], _x3 );
			VAL_MAC_CONJ( _yp5, bval[29], _x3 );
			VAL_MAC_CONJ( _yp6, bval[30], _x3 );
			VAL_MAC_CONJ( _yp7, bval[31], _x3 );
			VAL_MAC_CONJ( _yp0, bval[32], _x4 );
			VAL_MAC_CONJ( _yp1, bval[33], _x4 );
			VAL_MAC_CONJ( _yp2, bval[34], _x4 );
			VAL_MAC_CONJ( _yp3, bval[35], _x4 );
			VAL_MAC_CONJ( _yp4, bval[36], _x4 );
			VAL_MAC_CONJ( _yp5, bval[37], _x4 );
			VAL_MAC_CONJ( _yp6, bval[38], _x4 );
			VAL_MAC_CONJ( _yp7, bval[39], _x4 );
			VAL_MAC_CONJ( _yp0, bval[40], _x5 );
			VAL_MAC_CONJ( _yp1, bval[41], _x5 );
			VAL_MAC_CONJ( _yp2, bval[42], _x5 );
			VAL_MAC_CONJ( _yp3, bval[43], _x5 );
			VAL_MAC_CONJ( _yp4, bval[44], _x5 );
			VAL_MAC_CONJ( _yp5, bval[45], _x5 );
			VAL_MAC_CONJ( _yp6, bval[46], _x5 );
			VAL_MAC_CONJ( _yp7, bval[47], _x5 );
			VAL_MAC_CONJ( _yp0, bval[48], _x6 );
			VAL_MAC_CONJ( _yp1, bval[49], _x6 );
			VAL_MAC_CONJ( _yp2, bval[50], _x6 );
			VAL_MAC_CONJ( _yp3, bval[51], _x6 );
			VAL_MAC_CONJ( _yp4, bval[52], _x6 );
			VAL_MAC_CONJ( _yp5, bval[53], _x6 );
			VAL_MAC_CONJ( _yp6, bval[54], _x6 );
			VAL_MAC_CONJ( _yp7, bval[55], _x6 );
			/* Direct use of the block: _y(i) gets bval[8*i + j] paired
			 * with the unscaled _xp(j) */
			VAL_MAC( _y0, bval[0], _xp0 );
			VAL_MAC( _y1, bval[8], _xp0 );
			VAL_MAC( _y2, bval[16], _xp0 );
			VAL_MAC( _y3, bval[24], _xp0 );
			VAL_MAC( _y4, bval[32], _xp0 );
			VAL_MAC( _y5, bval[40], _xp0 );
			VAL_MAC( _y6, bval[48], _xp0 );
			VAL_MAC( _y0, bval[1], _xp1 );
			VAL_MAC( _y1, bval[9], _xp1 );
			VAL_MAC( _y2, bval[17], _xp1 );
			VAL_MAC( _y3, bval[25], _xp1 );
			VAL_MAC( _y4, bval[33], _xp1 );
			VAL_MAC( _y5, bval[41], _xp1 );
			VAL_MAC( _y6, bval[49], _xp1 );
			VAL_MAC( _y0, bval[2], _xp2 );
			VAL_MAC( _y1, bval[10], _xp2 );
			VAL_MAC( _y2, bval[18], _xp2 );
			VAL_MAC( _y3, bval[26], _xp2 );
			VAL_MAC( _y4, bval[34], _xp2 );
			VAL_MAC( _y5, bval[42], _xp2 );
			VAL_MAC( _y6, bval[50], _xp2 );
			VAL_MAC( _y0, bval[3], _xp3 );
			VAL_MAC( _y1, bval[11], _xp3 );
			VAL_MAC( _y2, bval[19], _xp3 );
			VAL_MAC( _y3, bval[27], _xp3 );
			VAL_MAC( _y4, bval[35], _xp3 );
			VAL_MAC( _y5, bval[43], _xp3 );
			VAL_MAC( _y6, bval[51], _xp3 );
			VAL_MAC( _y0, bval[4], _xp4 );
			VAL_MAC( _y1, bval[12], _xp4 );
			VAL_MAC( _y2, bval[20], _xp4 );
			VAL_MAC( _y3, bval[28], _xp4 );
			VAL_MAC( _y4, bval[36], _xp4 );
			VAL_MAC( _y5, bval[44], _xp4 );
			VAL_MAC( _y6, bval[52], _xp4 );
			VAL_MAC( _y0, bval[5], _xp5 );
			VAL_MAC( _y1, bval[13], _xp5 );
			VAL_MAC( _y2, bval[21], _xp5 );
			VAL_MAC( _y3, bval[29], _xp5 );
			VAL_MAC( _y4, bval[37], _xp5 );
			VAL_MAC( _y5, bval[45], _xp5 );
			VAL_MAC( _y6, bval[53], _xp5 );
			VAL_MAC( _y0, bval[6], _xp6 );
			VAL_MAC( _y1, bval[14], _xp6 );
			VAL_MAC( _y2, bval[22], _xp6 );
			VAL_MAC( _y3, bval[30], _xp6 );
			VAL_MAC( _y4, bval[38], _xp6 );
			VAL_MAC( _y5, bval[46], _xp6 );
			VAL_MAC( _y6, bval[54], _xp6 );
			VAL_MAC( _y0, bval[7], _xp7 );
			VAL_MAC( _y1, bval[15], _xp7 );
			VAL_MAC( _y2, bval[23], _xp7 );
			VAL_MAC( _y3, bval[31], _xp7 );
			VAL_MAC( _y4, bval[39], _xp7 );
			VAL_MAC( _y5, bval[47], _xp7 );
			VAL_MAC( _y6, bval[55], _xp7 );
			/* Scatter the conjugated contribution into y at the block's
			 * columns; alpha is already folded into _x* */
			VAL_INC( ypp[0], _yp0 );
			VAL_INC( ypp[1*incy], _yp1 );
			VAL_INC( ypp[2*incy], _yp2 );
			VAL_INC( ypp[3*incy], _yp3 );
			VAL_INC( ypp[4*incy], _yp4 );
			VAL_INC( ypp[5*incy], _yp5 );
			VAL_INC( ypp[6*incy], _yp6 );
			VAL_INC( ypp[7*incy], _yp7 );
		}
		/* Fold the direct contribution into y; alpha is applied here */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1*incy], alpha, _y1 );
		VAL_MAC( yp[2*incy], alpha, _y2 );
		VAL_MAC( yp[3*incy], alpha, _y3 );
		VAL_MAC( yp[4*incy], alpha, _y4 );
		VAL_MAC( yp[5*incy], alpha, _y5 );
		VAL_MAC( yp[6*incy], alpha, _y6 );
	}
	/* Diagonal block multiply */
	yp = y + d0*incy;
	xp = x + d0;
	for( I = 0; I < M; I++, bdiag += 7*7, yp += 7*incy, xp += 7 )
	{
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		/* Unlike the first pass, x is loaded unscaled here */
		VAL_ASSIGN( _x0, xp[0] );
		VAL_ASSIGN( _x1, xp[1] );
		VAL_ASSIGN( _x2, xp[2] );
		VAL_ASSIGN( _x3, xp[3] );
		VAL_ASSIGN( _x4, xp[4] );
		VAL_ASSIGN( _x5, xp[5] );
		VAL_ASSIGN( _x6, xp[6] );
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _y(i) accumulates bdiag[7*i + j] paired with _x(j) */
		VAL_MAC( _y0, bdiag[0], _x0 );
		VAL_MAC( _y1, bdiag[7], _x0 );
		VAL_MAC( _y2, bdiag[14], _x0 );
		VAL_MAC( _y3, bdiag[21], _x0 );
		VAL_MAC( _y4, bdiag[28], _x0 );
		VAL_MAC( _y5, bdiag[35], _x0 );
		VAL_MAC( _y6, bdiag[42], _x0 );
		VAL_MAC( _y0, bdiag[1], _x1 );
		VAL_MAC( _y1, bdiag[8], _x1 );
		VAL_MAC( _y2, bdiag[15], _x1 );
		VAL_MAC( _y3, bdiag[22], _x1 );
		VAL_MAC( _y4, bdiag[29], _x1 );
		VAL_MAC( _y5, bdiag[36], _x1 );
		VAL_MAC( _y6, bdiag[43], _x1 );
		VAL_MAC( _y0, bdiag[2], _x2 );
		VAL_MAC( _y1, bdiag[9], _x2 );
		VAL_MAC( _y2, bdiag[16], _x2 );
		VAL_MAC( _y3, bdiag[23], _x2 );
		VAL_MAC( _y4, bdiag[30], _x2 );
		VAL_MAC( _y5, bdiag[37], _x2 );
		VAL_MAC( _y6, bdiag[44], _x2 );
		VAL_MAC( _y0, bdiag[3], _x3 );
		VAL_MAC( _y1, bdiag[10], _x3 );
		VAL_MAC( _y2, bdiag[17], _x3 );
		VAL_MAC( _y3, bdiag[24], _x3 );
		VAL_MAC( _y4, bdiag[31], _x3 );
		VAL_MAC( _y5, bdiag[38], _x3 );
		VAL_MAC( _y6, bdiag[45], _x3 );
		VAL_MAC( _y0, bdiag[4], _x4 );
		VAL_MAC( _y1, bdiag[11], _x4 );
		VAL_MAC( _y2, bdiag[18], _x4 );
		VAL_MAC( _y3, bdiag[25], _x4 );
		VAL_MAC( _y4, bdiag[32], _x4 );
		VAL_MAC( _y5, bdiag[39], _x4 );
		VAL_MAC( _y6, bdiag[46], _x4 );
		VAL_MAC( _y0, bdiag[5], _x5 );
		VAL_MAC( _y1, bdiag[12], _x5 );
		VAL_MAC( _y2, bdiag[19], _x5 );
		VAL_MAC( _y3, bdiag[26], _x5 );
		VAL_MAC( _y4, bdiag[33], _x5 );
		VAL_MAC( _y5, bdiag[40], _x5 );
		VAL_MAC( _y6, bdiag[47], _x5 );
		VAL_MAC( _y0, bdiag[6], _x6 );
		VAL_MAC( _y1, bdiag[13], _x6 );
		VAL_MAC( _y2, bdiag[20], _x6 );
		VAL_MAC( _y3, bdiag[27], _x6 );
		VAL_MAC( _y4, bdiag[34], _x6 );
		VAL_MAC( _y5, bdiag[41], _x6 );
		VAL_MAC( _y6, bdiag[48], _x6 );
		/* Fold the diagonal contribution into y; alpha is applied here */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1*incy], alpha, _y1 );
		VAL_MAC( yp[2*incy], alpha, _y2 );
		VAL_MAC( yp[3*incy], alpha, _y3 );
		VAL_MAC( yp[4*incy], alpha, _y4 );
		VAL_MAC( yp[5*incy], alpha, _y5 );
		VAL_MAC( yp[6*incy], alpha, _y6 );
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1(). */
#define MBCSR_HermMatMult_v1_aX_b1_xsX_ys1 MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatMult_v1_aX_b1_xsX_ys1 */
#define MBCSR_HermMatMult_v1_aX_b1_xsX_ys1 MANGLE_MOD_(MBCSR_HermMatMult_v1_aX_b1_xsX_ys1_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$ (Hermitian variant),
 *  for general-stride \f$x\f$ and unit-stride \f$y\f$.
 *
 *  The routine makes two passes over the \c M block rows:
 *   -# Off-diagonal pass: every stored 7x8 block is used twice, once
 *      through VAL_MAC to update its own block row of \c y and once
 *      through VAL_MAC_CONJ to update the 8 elements of \c y that start
 *      at the block's leftmost column index.
 *   -# Diagonal pass: every 7x7 block of \c bdiag is applied to the
 *      matching 7 elements of \c x and accumulated into \c y.
 *
 *  \param[in] M  Number of block rows processed.
 *  \param[in] d0  Element offset of the first block row; \c x is read
 *  starting at <tt>x + d0*incx</tt> and \c y at <tt>y + d0</tt>.
 *  \param[in] bptr  Block-row pointers; block row \c I owns
 *  <tt>bptr[I+1] - bptr[I]</tt> off-diagonal blocks. The values are only
 *  used as loop bounds, never as indices into \c bind or \c bval.
 *  \param[in] bind  Leftmost column index of each off-diagonal block,
 *  consumed sequentially (one entry per block).
 *  \param[in] bval  Off-diagonal block values, 7*8 per block, consumed
 *  sequentially; <tt>bval[8*i + j]</tt> is used as entry (i, j).
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row;
 *  <tt>bdiag[7*i + j]</tt> is used as entry (i, j).
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x  Input vector.
 *  \param[in] incx  Distance between consecutive elements of \c x.
 *  \param[in,out] y  Output vector, accessed with unit stride and
 *  updated in place.
 *
 *  \note NOTE(review): the VAL_* macros are defined outside this file.
 *  The comments below assume VAL_MAC(a, b, c) performs a += b*c and
 *  VAL_MAC_CONJ(a, b, c) performs a += conj(b)*c -- confirm against the
 *  macro definitions.
 */
void
MBCSR_HermMatMult_v1_aX_b1_xsX_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y )
{
	oski_value_t* yp = y + d0;
	const oski_value_t* xp = x + d0*incx;
	oski_index_t I;

	/* Off-diagonal block multiply: one iteration per block row */
	for( I = 0; I < M; I++, yp += 7, xp += 7*incx )
	{
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		oski_index_t K;

		/* _y* accumulate this block row's (unscaled) result */
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _x* hold this block row's x entries combined with alpha, so the
		 * conjugated updates below need no further scaling */
		VAL_MUL( _x0, alpha, xp[0] );
		VAL_MUL( _x1, alpha, xp[1*incx] );
		VAL_MUL( _x2, alpha, xp[2*incx] );
		VAL_MUL( _x3, alpha, xp[3*incx] );
		VAL_MUL( _x4, alpha, xp[4*incx] );
		VAL_MUL( _x5, alpha, xp[5*incx] );
		VAL_MUL( _x6, alpha, xp[6*incx] );

		/* bind and bval advance by one block per iteration and are never
		 * rewound, so blocks are consumed in storage order across rows */
		for( K = bptr[I]; K < bptr[I+1]; K++, bind++, bval += 7*8 )
		{
			oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* xpp = x + j0*incx;
			oski_value_t* ypp = y + j0;

			REGISTER oski_value_t _xp0;
			REGISTER oski_value_t _xp1;
			REGISTER oski_value_t _xp2;
			REGISTER oski_value_t _xp3;
			REGISTER oski_value_t _xp4;
			REGISTER oski_value_t _xp5;
			REGISTER oski_value_t _xp6;
			REGISTER oski_value_t _xp7;
			REGISTER oski_value_t _yp0;
			REGISTER oski_value_t _yp1;
			REGISTER oski_value_t _yp2;
			REGISTER oski_value_t _yp3;
			REGISTER oski_value_t _yp4;
			REGISTER oski_value_t _yp5;
			REGISTER oski_value_t _yp6;
			REGISTER oski_value_t _yp7;
			VAL_SET_ZERO( _yp0 );
			VAL_SET_ZERO( _yp1 );
			VAL_SET_ZERO( _yp2 );
			VAL_SET_ZERO( _yp3 );
			VAL_SET_ZERO( _yp4 );
			VAL_SET_ZERO( _yp5 );
			VAL_SET_ZERO( _yp6 );
			VAL_SET_ZERO( _yp7 );
			/* Load the 8 x entries facing this block's columns */
			VAL_ASSIGN( _xp0, xpp[0] );
			VAL_ASSIGN( _xp1, xpp[1*incx] );
			VAL_ASSIGN( _xp2, xpp[2*incx] );
			VAL_ASSIGN( _xp3, xpp[3*incx] );
			VAL_ASSIGN( _xp4, xpp[4*incx] );
			VAL_ASSIGN( _xp5, xpp[5*incx] );
			VAL_ASSIGN( _xp6, xpp[6*incx] );
			VAL_ASSIGN( _xp7, xpp[7*incx] );
			/* Conjugated use of the block: _yp(j) gets bval[8*i + j]
			 * paired with the alpha-scaled _x(i) */
			VAL_MAC_CONJ( _yp0, bval[0], _x0 );
			VAL_MAC_CONJ( _yp1, bval[1], _x0 );
			VAL_MAC_CONJ( _yp2, bval[2], _x0 );
			VAL_MAC_CONJ( _yp3, bval[3], _x0 );
			VAL_MAC_CONJ( _yp4, bval[4], _x0 );
			VAL_MAC_CONJ( _yp5, bval[5], _x0 );
			VAL_MAC_CONJ( _yp6, bval[6], _x0 );
			VAL_MAC_CONJ( _yp7, bval[7], _x0 );
			VAL_MAC_CONJ( _yp0, bval[8], _x1 );
			VAL_MAC_CONJ( _yp1, bval[9], _x1 );
			VAL_MAC_CONJ( _yp2, bval[10], _x1 );
			VAL_MAC_CONJ( _yp3, bval[11], _x1 );
			VAL_MAC_CONJ( _yp4, bval[12], _x1 );
			VAL_MAC_CONJ( _yp5, bval[13], _x1 );
			VAL_MAC_CONJ( _yp6, bval[14], _x1 );
			VAL_MAC_CONJ( _yp7, bval[15], _x1 );
			VAL_MAC_CONJ( _yp0, bval[16], _x2 );
			VAL_MAC_CONJ( _yp1, bval[17], _x2 );
			VAL_MAC_CONJ( _yp2, bval[18], _x2 );
			VAL_MAC_CONJ( _yp3, bval[19], _x2 );
			VAL_MAC_CONJ( _yp4, bval[20], _x2 );
			VAL_MAC_CONJ( _yp5, bval[21], _x2 );
			VAL_MAC_CONJ( _yp6, bval[22], _x2 );
			VAL_MAC_CONJ( _yp7, bval[23], _x2 );
			VAL_MAC_CONJ( _yp0, bval[24], _x3 );
			VAL_MAC_CONJ( _yp1, bval[25], _x3 );
			VAL_MAC_CONJ( _yp2, bval[26], _x3 );
			VAL_MAC_CONJ( _yp3, bval[27], _x3 );
			VAL_MAC_CONJ( _yp4, bval[28], _x3 );
			VAL_MAC_CONJ( _yp5, bval[29], _x3 );
			VAL_MAC_CONJ( _yp6, bval[30], _x3 );
			VAL_MAC_CONJ( _yp7, bval[31], _x3 );
			VAL_MAC_CONJ( _yp0, bval[32], _x4 );
			VAL_MAC_CONJ( _yp1, bval[33], _x4 );
			VAL_MAC_CONJ( _yp2, bval[34], _x4 );
			VAL_MAC_CONJ( _yp3, bval[35], _x4 );
			VAL_MAC_CONJ( _yp4, bval[36], _x4 );
			VAL_MAC_CONJ( _yp5, bval[37], _x4 );
			VAL_MAC_CONJ( _yp6, bval[38], _x4 );
			VAL_MAC_CONJ( _yp7, bval[39], _x4 );
			VAL_MAC_CONJ( _yp0, bval[40], _x5 );
			VAL_MAC_CONJ( _yp1, bval[41], _x5 );
			VAL_MAC_CONJ( _yp2, bval[42], _x5 );
			VAL_MAC_CONJ( _yp3, bval[43], _x5 );
			VAL_MAC_CONJ( _yp4, bval[44], _x5 );
			VAL_MAC_CONJ( _yp5, bval[45], _x5 );
			VAL_MAC_CONJ( _yp6, bval[46], _x5 );
			VAL_MAC_CONJ( _yp7, bval[47], _x5 );
			VAL_MAC_CONJ( _yp0, bval[48], _x6 );
			VAL_MAC_CONJ( _yp1, bval[49], _x6 );
			VAL_MAC_CONJ( _yp2, bval[50], _x6 );
			VAL_MAC_CONJ( _yp3, bval[51], _x6 );
			VAL_MAC_CONJ( _yp4, bval[52], _x6 );
			VAL_MAC_CONJ( _yp5, bval[53], _x6 );
			VAL_MAC_CONJ( _yp6, bval[54], _x6 );
			VAL_MAC_CONJ( _yp7, bval[55], _x6 );
			/* Direct use of the block: _y(i) gets bval[8*i + j] paired
			 * with the unscaled _xp(j) */
			VAL_MAC( _y0, bval[0], _xp0 );
			VAL_MAC( _y1, bval[8], _xp0 );
			VAL_MAC( _y2, bval[16], _xp0 );
			VAL_MAC( _y3, bval[24], _xp0 );
			VAL_MAC( _y4, bval[32], _xp0 );
			VAL_MAC( _y5, bval[40], _xp0 );
			VAL_MAC( _y6, bval[48], _xp0 );
			VAL_MAC( _y0, bval[1], _xp1 );
			VAL_MAC( _y1, bval[9], _xp1 );
			VAL_MAC( _y2, bval[17], _xp1 );
			VAL_MAC( _y3, bval[25], _xp1 );
			VAL_MAC( _y4, bval[33], _xp1 );
			VAL_MAC( _y5, bval[41], _xp1 );
			VAL_MAC( _y6, bval[49], _xp1 );
			VAL_MAC( _y0, bval[2], _xp2 );
			VAL_MAC( _y1, bval[10], _xp2 );
			VAL_MAC( _y2, bval[18], _xp2 );
			VAL_MAC( _y3, bval[26], _xp2 );
			VAL_MAC( _y4, bval[34], _xp2 );
			VAL_MAC( _y5, bval[42], _xp2 );
			VAL_MAC( _y6, bval[50], _xp2 );
			VAL_MAC( _y0, bval[3], _xp3 );
			VAL_MAC( _y1, bval[11], _xp3 );
			VAL_MAC( _y2, bval[19], _xp3 );
			VAL_MAC( _y3, bval[27], _xp3 );
			VAL_MAC( _y4, bval[35], _xp3 );
			VAL_MAC( _y5, bval[43], _xp3 );
			VAL_MAC( _y6, bval[51], _xp3 );
			VAL_MAC( _y0, bval[4], _xp4 );
			VAL_MAC( _y1, bval[12], _xp4 );
			VAL_MAC( _y2, bval[20], _xp4 );
			VAL_MAC( _y3, bval[28], _xp4 );
			VAL_MAC( _y4, bval[36], _xp4 );
			VAL_MAC( _y5, bval[44], _xp4 );
			VAL_MAC( _y6, bval[52], _xp4 );
			VAL_MAC( _y0, bval[5], _xp5 );
			VAL_MAC( _y1, bval[13], _xp5 );
			VAL_MAC( _y2, bval[21], _xp5 );
			VAL_MAC( _y3, bval[29], _xp5 );
			VAL_MAC( _y4, bval[37], _xp5 );
			VAL_MAC( _y5, bval[45], _xp5 );
			VAL_MAC( _y6, bval[53], _xp5 );
			VAL_MAC( _y0, bval[6], _xp6 );
			VAL_MAC( _y1, bval[14], _xp6 );
			VAL_MAC( _y2, bval[22], _xp6 );
			VAL_MAC( _y3, bval[30], _xp6 );
			VAL_MAC( _y4, bval[38], _xp6 );
			VAL_MAC( _y5, bval[46], _xp6 );
			VAL_MAC( _y6, bval[54], _xp6 );
			VAL_MAC( _y0, bval[7], _xp7 );
			VAL_MAC( _y1, bval[15], _xp7 );
			VAL_MAC( _y2, bval[23], _xp7 );
			VAL_MAC( _y3, bval[31], _xp7 );
			VAL_MAC( _y4, bval[39], _xp7 );
			VAL_MAC( _y5, bval[47], _xp7 );
			VAL_MAC( _y6, bval[55], _xp7 );
			/* Scatter the conjugated contribution into y at the block's
			 * columns; alpha is already folded into _x* */
			VAL_INC( ypp[0], _yp0 );
			VAL_INC( ypp[1], _yp1 );
			VAL_INC( ypp[2], _yp2 );
			VAL_INC( ypp[3], _yp3 );
			VAL_INC( ypp[4], _yp4 );
			VAL_INC( ypp[5], _yp5 );
			VAL_INC( ypp[6], _yp6 );
			VAL_INC( ypp[7], _yp7 );
		}
		/* Fold the direct contribution into y; alpha is applied here */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1], alpha, _y1 );
		VAL_MAC( yp[2], alpha, _y2 );
		VAL_MAC( yp[3], alpha, _y3 );
		VAL_MAC( yp[4], alpha, _y4 );
		VAL_MAC( yp[5], alpha, _y5 );
		VAL_MAC( yp[6], alpha, _y6 );
	}
	/* Diagonal block multiply */
	yp = y + d0;
	xp = x + d0*incx;
	for( I = 0; I < M; I++, bdiag += 7*7, yp += 7, xp += 7*incx )
	{
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		/* Unlike the first pass, x is loaded unscaled here */
		VAL_ASSIGN( _x0, xp[0] );
		VAL_ASSIGN( _x1, xp[1*incx] );
		VAL_ASSIGN( _x2, xp[2*incx] );
		VAL_ASSIGN( _x3, xp[3*incx] );
		VAL_ASSIGN( _x4, xp[4*incx] );
		VAL_ASSIGN( _x5, xp[5*incx] );
		VAL_ASSIGN( _x6, xp[6*incx] );
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _y(i) accumulates bdiag[7*i + j] paired with _x(j) */
		VAL_MAC( _y0, bdiag[0], _x0 );
		VAL_MAC( _y1, bdiag[7], _x0 );
		VAL_MAC( _y2, bdiag[14], _x0 );
		VAL_MAC( _y3, bdiag[21], _x0 );
		VAL_MAC( _y4, bdiag[28], _x0 );
		VAL_MAC( _y5, bdiag[35], _x0 );
		VAL_MAC( _y6, bdiag[42], _x0 );
		VAL_MAC( _y0, bdiag[1], _x1 );
		VAL_MAC( _y1, bdiag[8], _x1 );
		VAL_MAC( _y2, bdiag[15], _x1 );
		VAL_MAC( _y3, bdiag[22], _x1 );
		VAL_MAC( _y4, bdiag[29], _x1 );
		VAL_MAC( _y5, bdiag[36], _x1 );
		VAL_MAC( _y6, bdiag[43], _x1 );
		VAL_MAC( _y0, bdiag[2], _x2 );
		VAL_MAC( _y1, bdiag[9], _x2 );
		VAL_MAC( _y2, bdiag[16], _x2 );
		VAL_MAC( _y3, bdiag[23], _x2 );
		VAL_MAC( _y4, bdiag[30], _x2 );
		VAL_MAC( _y5, bdiag[37], _x2 );
		VAL_MAC( _y6, bdiag[44], _x2 );
		VAL_MAC( _y0, bdiag[3], _x3 );
		VAL_MAC( _y1, bdiag[10], _x3 );
		VAL_MAC( _y2, bdiag[17], _x3 );
		VAL_MAC( _y3, bdiag[24], _x3 );
		VAL_MAC( _y4, bdiag[31], _x3 );
		VAL_MAC( _y5, bdiag[38], _x3 );
		VAL_MAC( _y6, bdiag[45], _x3 );
		VAL_MAC( _y0, bdiag[4], _x4 );
		VAL_MAC( _y1, bdiag[11], _x4 );
		VAL_MAC( _y2, bdiag[18], _x4 );
		VAL_MAC( _y3, bdiag[25], _x4 );
		VAL_MAC( _y4, bdiag[32], _x4 );
		VAL_MAC( _y5, bdiag[39], _x4 );
		VAL_MAC( _y6, bdiag[46], _x4 );
		VAL_MAC( _y0, bdiag[5], _x5 );
		VAL_MAC( _y1, bdiag[12], _x5 );
		VAL_MAC( _y2, bdiag[19], _x5 );
		VAL_MAC( _y3, bdiag[26], _x5 );
		VAL_MAC( _y4, bdiag[33], _x5 );
		VAL_MAC( _y5, bdiag[40], _x5 );
		VAL_MAC( _y6, bdiag[47], _x5 );
		VAL_MAC( _y0, bdiag[6], _x6 );
		VAL_MAC( _y1, bdiag[13], _x6 );
		VAL_MAC( _y2, bdiag[20], _x6 );
		VAL_MAC( _y3, bdiag[27], _x6 );
		VAL_MAC( _y4, bdiag[34], _x6 );
		VAL_MAC( _y5, bdiag[41], _x6 );
		VAL_MAC( _y6, bdiag[48], _x6 );
		/* Fold the diagonal contribution into y; alpha is applied here */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1], alpha, _y1 );
		VAL_MAC( yp[2], alpha, _y2 );
		VAL_MAC( yp[3], alpha, _y3 );
		VAL_MAC( yp[4], alpha, _y4 );
		VAL_MAC( yp[5], alpha, _y5 );
		VAL_MAC( yp[6], alpha, _y6 );
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX(). */
#define MBCSR_HermMatMult_v1_aX_b1_xsX_ysX MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatMult_v1_aX_b1_xsX_ysX */
#define MBCSR_HermMatMult_v1_aX_b1_xsX_ysX MANGLE_MOD_(MBCSR_HermMatMult_v1_aX_b1_xsX_ysX_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$ (Hermitian variant),
 *  for general-stride \f$x\f$ and general-stride \f$y\f$.
 *
 *  The routine makes two passes over the \c M block rows:
 *   -# Off-diagonal pass: every stored 7x8 block is used twice, once
 *      through VAL_MAC to update its own block row of \c y and once
 *      through VAL_MAC_CONJ to update the 8 elements of \c y that start
 *      at the block's leftmost column index.
 *   -# Diagonal pass: every 7x7 block of \c bdiag is applied to the
 *      matching 7 elements of \c x and accumulated into \c y.
 *
 *  \param[in] M  Number of block rows processed.
 *  \param[in] d0  Element offset of the first block row; \c x is read
 *  starting at <tt>x + d0*incx</tt> and \c y at <tt>y + d0*incy</tt>.
 *  \param[in] bptr  Block-row pointers; block row \c I owns
 *  <tt>bptr[I+1] - bptr[I]</tt> off-diagonal blocks. The values are only
 *  used as loop bounds, never as indices into \c bind or \c bval.
 *  \param[in] bind  Leftmost column index of each off-diagonal block,
 *  consumed sequentially (one entry per block).
 *  \param[in] bval  Off-diagonal block values, 7*8 per block, consumed
 *  sequentially; <tt>bval[8*i + j]</tt> is used as entry (i, j).
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row;
 *  <tt>bdiag[7*i + j]</tt> is used as entry (i, j).
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x  Input vector.
 *  \param[in] incx  Distance between consecutive elements of \c x.
 *  \param[in,out] y  Output vector, updated in place.
 *  \param[in] incy  Distance between consecutive elements of \c y.
 *
 *  \note NOTE(review): the VAL_* macros are defined outside this file.
 *  The comments below assume VAL_MAC(a, b, c) performs a += b*c and
 *  VAL_MAC_CONJ(a, b, c) performs a += conj(b)*c -- confirm against the
 *  macro definitions.
 */
void
MBCSR_HermMatMult_v1_aX_b1_xsX_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y, oski_index_t incy )
{
	oski_value_t* yp = y + d0*incy;
	const oski_value_t* xp = x + d0*incx;
	oski_index_t I;

	/* Off-diagonal block multiply: one iteration per block row */
	for( I = 0; I < M; I++, yp += 7*incy, xp += 7*incx )
	{
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		oski_index_t K;

		/* _y* accumulate this block row's (unscaled) result */
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _x* hold this block row's x entries combined with alpha, so the
		 * conjugated updates below need no further scaling */
		VAL_MUL( _x0, alpha, xp[0] );
		VAL_MUL( _x1, alpha, xp[1*incx] );
		VAL_MUL( _x2, alpha, xp[2*incx] );
		VAL_MUL( _x3, alpha, xp[3*incx] );
		VAL_MUL( _x4, alpha, xp[4*incx] );
		VAL_MUL( _x5, alpha, xp[5*incx] );
		VAL_MUL( _x6, alpha, xp[6*incx] );

		/* bind and bval advance by one block per iteration and are never
		 * rewound, so blocks are consumed in storage order across rows */
		for( K = bptr[I]; K < bptr[I+1]; K++, bind++, bval += 7*8 )
		{
			oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* xpp = x + j0*incx;
			oski_value_t* ypp = y + j0*incy;

			REGISTER oski_value_t _xp0;
			REGISTER oski_value_t _xp1;
			REGISTER oski_value_t _xp2;
			REGISTER oski_value_t _xp3;
			REGISTER oski_value_t _xp4;
			REGISTER oski_value_t _xp5;
			REGISTER oski_value_t _xp6;
			REGISTER oski_value_t _xp7;
			REGISTER oski_value_t _yp0;
			REGISTER oski_value_t _yp1;
			REGISTER oski_value_t _yp2;
			REGISTER oski_value_t _yp3;
			REGISTER oski_value_t _yp4;
			REGISTER oski_value_t _yp5;
			REGISTER oski_value_t _yp6;
			REGISTER oski_value_t _yp7;
			VAL_SET_ZERO( _yp0 );
			VAL_SET_ZERO( _yp1 );
			VAL_SET_ZERO( _yp2 );
			VAL_SET_ZERO( _yp3 );
			VAL_SET_ZERO( _yp4 );
			VAL_SET_ZERO( _yp5 );
			VAL_SET_ZERO( _yp6 );
			VAL_SET_ZERO( _yp7 );
			/* Load the 8 x entries facing this block's columns */
			VAL_ASSIGN( _xp0, xpp[0] );
			VAL_ASSIGN( _xp1, xpp[1*incx] );
			VAL_ASSIGN( _xp2, xpp[2*incx] );
			VAL_ASSIGN( _xp3, xpp[3*incx] );
			VAL_ASSIGN( _xp4, xpp[4*incx] );
			VAL_ASSIGN( _xp5, xpp[5*incx] );
			VAL_ASSIGN( _xp6, xpp[6*incx] );
			VAL_ASSIGN( _xp7, xpp[7*incx] );
			/* Conjugated use of the block: _yp(j) gets bval[8*i + j]
			 * paired with the alpha-scaled _x(i) */
			VAL_MAC_CONJ( _yp0, bval[0], _x0 );
			VAL_MAC_CONJ( _yp1, bval[1], _x0 );
			VAL_MAC_CONJ( _yp2, bval[2], _x0 );
			VAL_MAC_CONJ( _yp3, bval[3], _x0 );
			VAL_MAC_CONJ( _yp4, bval[4], _x0 );
			VAL_MAC_CONJ( _yp5, bval[5], _x0 );
			VAL_MAC_CONJ( _yp6, bval[6], _x0 );
			VAL_MAC_CONJ( _yp7, bval[7], _x0 );
			VAL_MAC_CONJ( _yp0, bval[8], _x1 );
			VAL_MAC_CONJ( _yp1, bval[9], _x1 );
			VAL_MAC_CONJ( _yp2, bval[10], _x1 );
			VAL_MAC_CONJ( _yp3, bval[11], _x1 );
			VAL_MAC_CONJ( _yp4, bval[12], _x1 );
			VAL_MAC_CONJ( _yp5, bval[13], _x1 );
			VAL_MAC_CONJ( _yp6, bval[14], _x1 );
			VAL_MAC_CONJ( _yp7, bval[15], _x1 );
			VAL_MAC_CONJ( _yp0, bval[16], _x2 );
			VAL_MAC_CONJ( _yp1, bval[17], _x2 );
			VAL_MAC_CONJ( _yp2, bval[18], _x2 );
			VAL_MAC_CONJ( _yp3, bval[19], _x2 );
			VAL_MAC_CONJ( _yp4, bval[20], _x2 );
			VAL_MAC_CONJ( _yp5, bval[21], _x2 );
			VAL_MAC_CONJ( _yp6, bval[22], _x2 );
			VAL_MAC_CONJ( _yp7, bval[23], _x2 );
			VAL_MAC_CONJ( _yp0, bval[24], _x3 );
			VAL_MAC_CONJ( _yp1, bval[25], _x3 );
			VAL_MAC_CONJ( _yp2, bval[26], _x3 );
			VAL_MAC_CONJ( _yp3, bval[27], _x3 );
			VAL_MAC_CONJ( _yp4, bval[28], _x3 );
			VAL_MAC_CONJ( _yp5, bval[29], _x3 );
			VAL_MAC_CONJ( _yp6, bval[30], _x3 );
			VAL_MAC_CONJ( _yp7, bval[31], _x3 );
			VAL_MAC_CONJ( _yp0, bval[32], _x4 );
			VAL_MAC_CONJ( _yp1, bval[33], _x4 );
			VAL_MAC_CONJ( _yp2, bval[34], _x4 );
			VAL_MAC_CONJ( _yp3, bval[35], _x4 );
			VAL_MAC_CONJ( _yp4, bval[36], _x4 );
			VAL_MAC_CONJ( _yp5, bval[37], _x4 );
			VAL_MAC_CONJ( _yp6, bval[38], _x4 );
			VAL_MAC_CONJ( _yp7, bval[39], _x4 );
			VAL_MAC_CONJ( _yp0, bval[40], _x5 );
			VAL_MAC_CONJ( _yp1, bval[41], _x5 );
			VAL_MAC_CONJ( _yp2, bval[42], _x5 );
			VAL_MAC_CONJ( _yp3, bval[43], _x5 );
			VAL_MAC_CONJ( _yp4, bval[44], _x5 );
			VAL_MAC_CONJ( _yp5, bval[45], _x5 );
			VAL_MAC_CONJ( _yp6, bval[46], _x5 );
			VAL_MAC_CONJ( _yp7, bval[47], _x5 );
			VAL_MAC_CONJ( _yp0, bval[48], _x6 );
			VAL_MAC_CONJ( _yp1, bval[49], _x6 );
			VAL_MAC_CONJ( _yp2, bval[50], _x6 );
			VAL_MAC_CONJ( _yp3, bval[51], _x6 );
			VAL_MAC_CONJ( _yp4, bval[52], _x6 );
			VAL_MAC_CONJ( _yp5, bval[53], _x6 );
			VAL_MAC_CONJ( _yp6, bval[54], _x6 );
			VAL_MAC_CONJ( _yp7, bval[55], _x6 );
			/* Direct use of the block: _y(i) gets bval[8*i + j] paired
			 * with the unscaled _xp(j) */
			VAL_MAC( _y0, bval[0], _xp0 );
			VAL_MAC( _y1, bval[8], _xp0 );
			VAL_MAC( _y2, bval[16], _xp0 );
			VAL_MAC( _y3, bval[24], _xp0 );
			VAL_MAC( _y4, bval[32], _xp0 );
			VAL_MAC( _y5, bval[40], _xp0 );
			VAL_MAC( _y6, bval[48], _xp0 );
			VAL_MAC( _y0, bval[1], _xp1 );
			VAL_MAC( _y1, bval[9], _xp1 );
			VAL_MAC( _y2, bval[17], _xp1 );
			VAL_MAC( _y3, bval[25], _xp1 );
			VAL_MAC( _y4, bval[33], _xp1 );
			VAL_MAC( _y5, bval[41], _xp1 );
			VAL_MAC( _y6, bval[49], _xp1 );
			VAL_MAC( _y0, bval[2], _xp2 );
			VAL_MAC( _y1, bval[10], _xp2 );
			VAL_MAC( _y2, bval[18], _xp2 );
			VAL_MAC( _y3, bval[26], _xp2 );
			VAL_MAC( _y4, bval[34], _xp2 );
			VAL_MAC( _y5, bval[42], _xp2 );
			VAL_MAC( _y6, bval[50], _xp2 );
			VAL_MAC( _y0, bval[3], _xp3 );
			VAL_MAC( _y1, bval[11], _xp3 );
			VAL_MAC( _y2, bval[19], _xp3 );
			VAL_MAC( _y3, bval[27], _xp3 );
			VAL_MAC( _y4, bval[35], _xp3 );
			VAL_MAC( _y5, bval[43], _xp3 );
			VAL_MAC( _y6, bval[51], _xp3 );
			VAL_MAC( _y0, bval[4], _xp4 );
			VAL_MAC( _y1, bval[12], _xp4 );
			VAL_MAC( _y2, bval[20], _xp4 );
			VAL_MAC( _y3, bval[28], _xp4 );
			VAL_MAC( _y4, bval[36], _xp4 );
			VAL_MAC( _y5, bval[44], _xp4 );
			VAL_MAC( _y6, bval[52], _xp4 );
			VAL_MAC( _y0, bval[5], _xp5 );
			VAL_MAC( _y1, bval[13], _xp5 );
			VAL_MAC( _y2, bval[21], _xp5 );
			VAL_MAC( _y3, bval[29], _xp5 );
			VAL_MAC( _y4, bval[37], _xp5 );
			VAL_MAC( _y5, bval[45], _xp5 );
			VAL_MAC( _y6, bval[53], _xp5 );
			VAL_MAC( _y0, bval[6], _xp6 );
			VAL_MAC( _y1, bval[14], _xp6 );
			VAL_MAC( _y2, bval[22], _xp6 );
			VAL_MAC( _y3, bval[30], _xp6 );
			VAL_MAC( _y4, bval[38], _xp6 );
			VAL_MAC( _y5, bval[46], _xp6 );
			VAL_MAC( _y6, bval[54], _xp6 );
			VAL_MAC( _y0, bval[7], _xp7 );
			VAL_MAC( _y1, bval[15], _xp7 );
			VAL_MAC( _y2, bval[23], _xp7 );
			VAL_MAC( _y3, bval[31], _xp7 );
			VAL_MAC( _y4, bval[39], _xp7 );
			VAL_MAC( _y5, bval[47], _xp7 );
			VAL_MAC( _y6, bval[55], _xp7 );
			/* Scatter the conjugated contribution into y at the block's
			 * columns; alpha is already folded into _x* */
			VAL_INC( ypp[0], _yp0 );
			VAL_INC( ypp[1*incy], _yp1 );
			VAL_INC( ypp[2*incy], _yp2 );
			VAL_INC( ypp[3*incy], _yp3 );
			VAL_INC( ypp[4*incy], _yp4 );
			VAL_INC( ypp[5*incy], _yp5 );
			VAL_INC( ypp[6*incy], _yp6 );
			VAL_INC( ypp[7*incy], _yp7 );
		}
		/* Fold the direct contribution into y; alpha is applied here */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1*incy], alpha, _y1 );
		VAL_MAC( yp[2*incy], alpha, _y2 );
		VAL_MAC( yp[3*incy], alpha, _y3 );
		VAL_MAC( yp[4*incy], alpha, _y4 );
		VAL_MAC( yp[5*incy], alpha, _y5 );
		VAL_MAC( yp[6*incy], alpha, _y6 );
	}
	/* Diagonal block multiply */
	yp = y + d0*incy;
	xp = x + d0*incx;
	for( I = 0; I < M; I++, bdiag += 7*7, yp += 7*incy, xp += 7*incx )
	{
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		/* Unlike the first pass, x is loaded unscaled here */
		VAL_ASSIGN( _x0, xp[0] );
		VAL_ASSIGN( _x1, xp[1*incx] );
		VAL_ASSIGN( _x2, xp[2*incx] );
		VAL_ASSIGN( _x3, xp[3*incx] );
		VAL_ASSIGN( _x4, xp[4*incx] );
		VAL_ASSIGN( _x5, xp[5*incx] );
		VAL_ASSIGN( _x6, xp[6*incx] );
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _y(i) accumulates bdiag[7*i + j] paired with _x(j) */
		VAL_MAC( _y0, bdiag[0], _x0 );
		VAL_MAC( _y1, bdiag[7], _x0 );
		VAL_MAC( _y2, bdiag[14], _x0 );
		VAL_MAC( _y3, bdiag[21], _x0 );
		VAL_MAC( _y4, bdiag[28], _x0 );
		VAL_MAC( _y5, bdiag[35], _x0 );
		VAL_MAC( _y6, bdiag[42], _x0 );
		VAL_MAC( _y0, bdiag[1], _x1 );
		VAL_MAC( _y1, bdiag[8], _x1 );
		VAL_MAC( _y2, bdiag[15], _x1 );
		VAL_MAC( _y3, bdiag[22], _x1 );
		VAL_MAC( _y4, bdiag[29], _x1 );
		VAL_MAC( _y5, bdiag[36], _x1 );
		VAL_MAC( _y6, bdiag[43], _x1 );
		VAL_MAC( _y0, bdiag[2], _x2 );
		VAL_MAC( _y1, bdiag[9], _x2 );
		VAL_MAC( _y2, bdiag[16], _x2 );
		VAL_MAC( _y3, bdiag[23], _x2 );
		VAL_MAC( _y4, bdiag[30], _x2 );
		VAL_MAC( _y5, bdiag[37], _x2 );
		VAL_MAC( _y6, bdiag[44], _x2 );
		VAL_MAC( _y0, bdiag[3], _x3 );
		VAL_MAC( _y1, bdiag[10], _x3 );
		VAL_MAC( _y2, bdiag[17], _x3 );
		VAL_MAC( _y3, bdiag[24], _x3 );
		VAL_MAC( _y4, bdiag[31], _x3 );
		VAL_MAC( _y5, bdiag[38], _x3 );
		VAL_MAC( _y6, bdiag[45], _x3 );
		VAL_MAC( _y0, bdiag[4], _x4 );
		VAL_MAC( _y1, bdiag[11], _x4 );
		VAL_MAC( _y2, bdiag[18], _x4 );
		VAL_MAC( _y3, bdiag[25], _x4 );
		VAL_MAC( _y4, bdiag[32], _x4 );
		VAL_MAC( _y5, bdiag[39], _x4 );
		VAL_MAC( _y6, bdiag[46], _x4 );
		VAL_MAC( _y0, bdiag[5], _x5 );
		VAL_MAC( _y1, bdiag[12], _x5 );
		VAL_MAC( _y2, bdiag[19], _x5 );
		VAL_MAC( _y3, bdiag[26], _x5 );
		VAL_MAC( _y4, bdiag[33], _x5 );
		VAL_MAC( _y5, bdiag[40], _x5 );
		VAL_MAC( _y6, bdiag[47], _x5 );
		VAL_MAC( _y0, bdiag[6], _x6 );
		VAL_MAC( _y1, bdiag[13], _x6 );
		VAL_MAC( _y2, bdiag[20], _x6 );
		VAL_MAC( _y3, bdiag[27], _x6 );
		VAL_MAC( _y4, bdiag[34], _x6 );
		VAL_MAC( _y5, bdiag[41], _x6 );
		VAL_MAC( _y6, bdiag[48], _x6 );
		/* Fold the diagonal contribution into y; alpha is applied here */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1*incy], alpha, _y1 );
		VAL_MAC( yp[2*incy], alpha, _y2 );
		VAL_MAC( yp[3*incy], alpha, _y3 );
		VAL_MAC( yp[4*incy], alpha, _y4 );
		VAL_MAC( yp[5*incy], alpha, _y5 );
		VAL_MAC( yp[6*incy], alpha, _y6 );
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX
/**
 *  \brief Matrix times single-vector multiply in the normal case;
 *  see SymmMatMult_v1().
 */
#define HermMatMult_v1 SymmMatMult_v1

#else /* IS_VAL_COMPLEX */


/**
 *  \brief Hermitian matrix times single-vector multiply: dispatches to
 *  the kernel specialized for the strides of \f$x\f$ and \f$y\f$.
 *
 *  A stride of exactly 1 selects the unit-stride ("s1") kernel for that
 *  vector; any other value selects the general-stride ("sX") kernel,
 *  which receives the stride as an extra argument.
 */
static void
HermMatMult_v1( oski_index_t M, oski_index_t d0,
	const oski_index_t* bptr, const oski_index_t* bind,
	const oski_value_t* bval, const oski_value_t* bdiag,
	oski_value_t alpha, const oski_value_t* x, oski_index_t incx,
	oski_value_t* y, oski_index_t incy )
{
	const int unit_x = ( incx == 1 );
	const int unit_y = ( incy == 1 );

	/* Both vectors contiguous */
	if( unit_x && unit_y ) {
		MBCSR_HermMatMult_v1_aX_b1_xs1_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y );
		return;
	}

	/* Contiguous x, strided y */
	if( unit_x ) {
		MBCSR_HermMatMult_v1_aX_b1_xs1_ysX( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y, incy );
		return;
	}

	/* Strided x, contiguous y */
	if( unit_y ) {
		MBCSR_HermMatMult_v1_aX_b1_xsX_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, incx, y );
		return;
	}

	/* Both vectors strided */
	MBCSR_HermMatMult_v1_aX_b1_xsX_ysX( M, d0,
		bptr, bind, bval, bdiag, alpha, x, incx, y, incy );
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX
/**
 *  \brief See SymmMatMult().
 */
#define HermMatMult SymmMatMult

#else /* IS_VAL_COMPLEX */



/**
 *  \brief Computes
 *  \f$y \leftarrow y + \alpha\cdot\mathrm{op}(A)\cdot x\f$,
 *  where \f$\mathrm{op}(A) = A\f$, on the fully blocked
 *  portion of \f$A\f$.
 *
 *  The multiply is applied one column at a time: column j of
 *  \p x_view is paired with column j of \p y_view, and each pair is
 *  handed to HermMatMult_v1() together with the views' row strides.
 *  The number of columns processed is taken from \p x_view.
 *
 *  \returns 0 always.
 */
static int
HermMatMult( const oski_submatMBCSR_t* A,
	oski_value_t alpha, const oski_vecview_t x_view,
	oski_vecview_t y_view )
{
	oski_index_t col; /* index of the column pair being processed */
	const oski_value_t* x_col; /* start of X(:, col) */
	oski_value_t* y_col; /* start of Y(:, col) */

	/* This file only implements the 7x8 block size. */
	assert( A->r == 7 );
	assert( A->c == 8 );

	x_col = x_view->val;
	y_col = y_view->val;
	for( col = 0; col < x_view->num_cols; col++ )
	{
		HermMatMult_v1( A->num_block_rows, A->offset,
			A->bptr, A->bind, A->bval, A->bdiag,
			alpha, x_col, x_view->rowinc, y_col, y_view->rowinc );

		/* Step both cursors to the next column. */
		x_col += x_view->colinc;
		y_col += y_view->colinc;
	}

	return 0;
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1(). */
#define MBCSR_HermMatConjMult_v1_aX_b1_xs1_ys1 MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatConjMult_v1_aX_b1_xs1_ys1 */
#define MBCSR_HermMatConjMult_v1_aX_b1_xs1_ys1 MANGLE_MOD_(MBCSR_HermMatConjMult_v1_aX_b1_xs1_ys1_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$,
 *  specialized for unit-stride \p x and unit-stride \p y.
 *
 *  The routine makes two passes over the M block rows. The first pass
 *  walks the stored off-diagonal 7x8 blocks; each block is used twice,
 *  once to update the 7 entries of y belonging to the current block
 *  row and once (transposed) to update the 8 entries of y starting at
 *  the block's column index. The second pass applies the 7x7 diagonal
 *  blocks.
 *
 *  \param[in] M      Number of block rows; both passes run I = 0..M-1.
 *  \param[in] d0     Offset added to \p x and \p y to reach the entries
 *                    of block row 0.
 *  \param[in] bptr   bptr[I+1] - bptr[I] off-diagonal blocks are
 *                    consumed for block row I. The counter K is not used
 *                    as an index; \p bind and \p bval are simply advanced
 *                    by one block per iteration.
 *  \param[in] bind   One entry per off-diagonal block: the index of the
 *                    block's leftmost column, used as an offset into
 *                    \p x and \p y.
 *  \param[in] bval   Off-diagonal block values, 7*8 per block; entry
 *                    (r, c) of a block is read from bval[r*8 + c].
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row; entry
 *                    (r, c) is read from bdiag[r*7 + c].
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x      Input vector, read only.
 *  \param[in,out] y  Output vector, accumulated into in place.
 *
 *  \note VAL_MAC_CONJ is defined outside this file; from the formula
 *  above it presumably accumulates conj(a)*b -- confirm against the
 *  macro definition.
 */
void
MBCSR_HermMatConjMult_v1_aX_b1_xs1_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y )
{
	oski_value_t* yp = y + d0;
	const oski_value_t* xp = x + d0;
	oski_index_t I;

	/* Pass 1: off-diagonal blocks, one block row (7 rows) at a time. */
	for( I = 0; I < M; I++, yp += 7, xp += 7 )
	{
		/* _y0.._y6: unscaled accumulators for this block row of y. */
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		/* _x0.._x6: this block row's entries of x, pre-scaled by alpha. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		oski_index_t K;

		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		VAL_MUL( _x0, alpha, xp[0] );
		VAL_MUL( _x1, alpha, xp[1] );
		VAL_MUL( _x2, alpha, xp[2] );
		VAL_MUL( _x3, alpha, xp[3] );
		VAL_MUL( _x4, alpha, xp[4] );
		VAL_MUL( _x5, alpha, xp[5] );
		VAL_MUL( _x6, alpha, xp[6] );

		/* Each iteration consumes one column index and one 7x8 block. */
		for( K = bptr[I]; K < bptr[I+1]; K++, bind++, bval += 7*8 )
		{
			oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* xpp = x + j0;
			oski_value_t* ypp = y + j0;

			/* _xp0.._xp7: the 8 entries of x under the block's columns. */
			REGISTER oski_value_t _xp0;
			REGISTER oski_value_t _xp1;
			REGISTER oski_value_t _xp2;
			REGISTER oski_value_t _xp3;
			REGISTER oski_value_t _xp4;
			REGISTER oski_value_t _xp5;
			REGISTER oski_value_t _xp6;
			REGISTER oski_value_t _xp7;
			/* _yp0.._yp7: transposed-block contribution to y at j0..j0+7. */
			REGISTER oski_value_t _yp0;
			REGISTER oski_value_t _yp1;
			REGISTER oski_value_t _yp2;
			REGISTER oski_value_t _yp3;
			REGISTER oski_value_t _yp4;
			REGISTER oski_value_t _yp5;
			REGISTER oski_value_t _yp6;
			REGISTER oski_value_t _yp7;
			VAL_SET_ZERO( _yp0 );
			VAL_SET_ZERO( _yp1 );
			VAL_SET_ZERO( _yp2 );
			VAL_SET_ZERO( _yp3 );
			VAL_SET_ZERO( _yp4 );
			VAL_SET_ZERO( _yp5 );
			VAL_SET_ZERO( _yp6 );
			VAL_SET_ZERO( _yp7 );
			VAL_ASSIGN( _xp0, xpp[0] );
			VAL_ASSIGN( _xp1, xpp[1] );
			VAL_ASSIGN( _xp2, xpp[2] );
			VAL_ASSIGN( _xp3, xpp[3] );
			VAL_ASSIGN( _xp4, xpp[4] );
			VAL_ASSIGN( _xp5, xpp[5] );
			VAL_ASSIGN( _xp6, xpp[6] );
			VAL_ASSIGN( _xp7, xpp[7] );
			/* Transposed use, no conjugation: _yp[c] += bval[r*8+c] * _x[r]
			 * (alpha is already folded into _x). */
			VAL_MAC( _yp0, bval[0], _x0 );
			VAL_MAC( _yp1, bval[1], _x0 );
			VAL_MAC( _yp2, bval[2], _x0 );
			VAL_MAC( _yp3, bval[3], _x0 );
			VAL_MAC( _yp4, bval[4], _x0 );
			VAL_MAC( _yp5, bval[5], _x0 );
			VAL_MAC( _yp6, bval[6], _x0 );
			VAL_MAC( _yp7, bval[7], _x0 );
			VAL_MAC( _yp0, bval[8], _x1 );
			VAL_MAC( _yp1, bval[9], _x1 );
			VAL_MAC( _yp2, bval[10], _x1 );
			VAL_MAC( _yp3, bval[11], _x1 );
			VAL_MAC( _yp4, bval[12], _x1 );
			VAL_MAC( _yp5, bval[13], _x1 );
			VAL_MAC( _yp6, bval[14], _x1 );
			VAL_MAC( _yp7, bval[15], _x1 );
			VAL_MAC( _yp0, bval[16], _x2 );
			VAL_MAC( _yp1, bval[17], _x2 );
			VAL_MAC( _yp2, bval[18], _x2 );
			VAL_MAC( _yp3, bval[19], _x2 );
			VAL_MAC( _yp4, bval[20], _x2 );
			VAL_MAC( _yp5, bval[21], _x2 );
			VAL_MAC( _yp6, bval[22], _x2 );
			VAL_MAC( _yp7, bval[23], _x2 );
			VAL_MAC( _yp0, bval[24], _x3 );
			VAL_MAC( _yp1, bval[25], _x3 );
			VAL_MAC( _yp2, bval[26], _x3 );
			VAL_MAC( _yp3, bval[27], _x3 );
			VAL_MAC( _yp4, bval[28], _x3 );
			VAL_MAC( _yp5, bval[29], _x3 );
			VAL_MAC( _yp6, bval[30], _x3 );
			VAL_MAC( _yp7, bval[31], _x3 );
			VAL_MAC( _yp0, bval[32], _x4 );
			VAL_MAC( _yp1, bval[33], _x4 );
			VAL_MAC( _yp2, bval[34], _x4 );
			VAL_MAC( _yp3, bval[35], _x4 );
			VAL_MAC( _yp4, bval[36], _x4 );
			VAL_MAC( _yp5, bval[37], _x4 );
			VAL_MAC( _yp6, bval[38], _x4 );
			VAL_MAC( _yp7, bval[39], _x4 );
			VAL_MAC( _yp0, bval[40], _x5 );
			VAL_MAC( _yp1, bval[41], _x5 );
			VAL_MAC( _yp2, bval[42], _x5 );
			VAL_MAC( _yp3, bval[43], _x5 );
			VAL_MAC( _yp4, bval[44], _x5 );
			VAL_MAC( _yp5, bval[45], _x5 );
			VAL_MAC( _yp6, bval[46], _x5 );
			VAL_MAC( _yp7, bval[47], _x5 );
			VAL_MAC( _yp0, bval[48], _x6 );
			VAL_MAC( _yp1, bval[49], _x6 );
			VAL_MAC( _yp2, bval[50], _x6 );
			VAL_MAC( _yp3, bval[51], _x6 );
			VAL_MAC( _yp4, bval[52], _x6 );
			VAL_MAC( _yp5, bval[53], _x6 );
			VAL_MAC( _yp6, bval[54], _x6 );
			VAL_MAC( _yp7, bval[55], _x6 );
			/* Direct use via VAL_MAC_CONJ: _y[r] accumulates bval[r*8+c]
			 * against _xp[c]; alpha is applied after the block loop. */
			VAL_MAC_CONJ( _y0, bval[0], _xp0 );
			VAL_MAC_CONJ( _y1, bval[8], _xp0 );
			VAL_MAC_CONJ( _y2, bval[16], _xp0 );
			VAL_MAC_CONJ( _y3, bval[24], _xp0 );
			VAL_MAC_CONJ( _y4, bval[32], _xp0 );
			VAL_MAC_CONJ( _y5, bval[40], _xp0 );
			VAL_MAC_CONJ( _y6, bval[48], _xp0 );
			VAL_MAC_CONJ( _y0, bval[1], _xp1 );
			VAL_MAC_CONJ( _y1, bval[9], _xp1 );
			VAL_MAC_CONJ( _y2, bval[17], _xp1 );
			VAL_MAC_CONJ( _y3, bval[25], _xp1 );
			VAL_MAC_CONJ( _y4, bval[33], _xp1 );
			VAL_MAC_CONJ( _y5, bval[41], _xp1 );
			VAL_MAC_CONJ( _y6, bval[49], _xp1 );
			VAL_MAC_CONJ( _y0, bval[2], _xp2 );
			VAL_MAC_CONJ( _y1, bval[10], _xp2 );
			VAL_MAC_CONJ( _y2, bval[18], _xp2 );
			VAL_MAC_CONJ( _y3, bval[26], _xp2 );
			VAL_MAC_CONJ( _y4, bval[34], _xp2 );
			VAL_MAC_CONJ( _y5, bval[42], _xp2 );
			VAL_MAC_CONJ( _y6, bval[50], _xp2 );
			VAL_MAC_CONJ( _y0, bval[3], _xp3 );
			VAL_MAC_CONJ( _y1, bval[11], _xp3 );
			VAL_MAC_CONJ( _y2, bval[19], _xp3 );
			VAL_MAC_CONJ( _y3, bval[27], _xp3 );
			VAL_MAC_CONJ( _y4, bval[35], _xp3 );
			VAL_MAC_CONJ( _y5, bval[43], _xp3 );
			VAL_MAC_CONJ( _y6, bval[51], _xp3 );
			VAL_MAC_CONJ( _y0, bval[4], _xp4 );
			VAL_MAC_CONJ( _y1, bval[12], _xp4 );
			VAL_MAC_CONJ( _y2, bval[20], _xp4 );
			VAL_MAC_CONJ( _y3, bval[28], _xp4 );
			VAL_MAC_CONJ( _y4, bval[36], _xp4 );
			VAL_MAC_CONJ( _y5, bval[44], _xp4 );
			VAL_MAC_CONJ( _y6, bval[52], _xp4 );
			VAL_MAC_CONJ( _y0, bval[5], _xp5 );
			VAL_MAC_CONJ( _y1, bval[13], _xp5 );
			VAL_MAC_CONJ( _y2, bval[21], _xp5 );
			VAL_MAC_CONJ( _y3, bval[29], _xp5 );
			VAL_MAC_CONJ( _y4, bval[37], _xp5 );
			VAL_MAC_CONJ( _y5, bval[45], _xp5 );
			VAL_MAC_CONJ( _y6, bval[53], _xp5 );
			VAL_MAC_CONJ( _y0, bval[6], _xp6 );
			VAL_MAC_CONJ( _y1, bval[14], _xp6 );
			VAL_MAC_CONJ( _y2, bval[22], _xp6 );
			VAL_MAC_CONJ( _y3, bval[30], _xp6 );
			VAL_MAC_CONJ( _y4, bval[38], _xp6 );
			VAL_MAC_CONJ( _y5, bval[46], _xp6 );
			VAL_MAC_CONJ( _y6, bval[54], _xp6 );
			VAL_MAC_CONJ( _y0, bval[7], _xp7 );
			VAL_MAC_CONJ( _y1, bval[15], _xp7 );
			VAL_MAC_CONJ( _y2, bval[23], _xp7 );
			VAL_MAC_CONJ( _y3, bval[31], _xp7 );
			VAL_MAC_CONJ( _y4, bval[39], _xp7 );
			VAL_MAC_CONJ( _y5, bval[47], _xp7 );
			VAL_MAC_CONJ( _y6, bval[55], _xp7 );
			/* Scatter the transposed contribution into y at the block's
			 * columns. */
			VAL_INC( ypp[0], _yp0 );
			VAL_INC( ypp[1], _yp1 );
			VAL_INC( ypp[2], _yp2 );
			VAL_INC( ypp[3], _yp3 );
			VAL_INC( ypp[4], _yp4 );
			VAL_INC( ypp[5], _yp5 );
			VAL_INC( ypp[6], _yp6 );
			VAL_INC( ypp[7], _yp7 );
		}
		/* Fold this block row's accumulated sums into y, scaled by alpha. */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1], alpha, _y1 );
		VAL_MAC( yp[2], alpha, _y2 );
		VAL_MAC( yp[3], alpha, _y3 );
		VAL_MAC( yp[4], alpha, _y4 );
		VAL_MAC( yp[5], alpha, _y5 );
		VAL_MAC( yp[6], alpha, _y6 );
	}
	/* Pass 2: diagonal block multiply; rewind to block row 0 and consume
	 * one 7x7 block of bdiag per block row. */
	yp = y + d0;
	xp = x + d0;
	for( I = 0; I < M; I++, bdiag += 7*7, yp += 7, xp += 7 )
	{
		/* Here _x0.._x6 hold the unscaled entries of x; alpha is applied
		 * when the sums are added to y below. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		VAL_ASSIGN( _x0, xp[0] );
		VAL_ASSIGN( _x1, xp[1] );
		VAL_ASSIGN( _x2, xp[2] );
		VAL_ASSIGN( _x3, xp[3] );
		VAL_ASSIGN( _x4, xp[4] );
		VAL_ASSIGN( _x5, xp[5] );
		VAL_ASSIGN( _x6, xp[6] );
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _y[r] accumulates bdiag[r*7+c] against _x[c] via VAL_MAC_CONJ. */
		VAL_MAC_CONJ( _y0, bdiag[0], _x0 );
		VAL_MAC_CONJ( _y1, bdiag[7], _x0 );
		VAL_MAC_CONJ( _y2, bdiag[14], _x0 );
		VAL_MAC_CONJ( _y3, bdiag[21], _x0 );
		VAL_MAC_CONJ( _y4, bdiag[28], _x0 );
		VAL_MAC_CONJ( _y5, bdiag[35], _x0 );
		VAL_MAC_CONJ( _y6, bdiag[42], _x0 );
		VAL_MAC_CONJ( _y0, bdiag[1], _x1 );
		VAL_MAC_CONJ( _y1, bdiag[8], _x1 );
		VAL_MAC_CONJ( _y2, bdiag[15], _x1 );
		VAL_MAC_CONJ( _y3, bdiag[22], _x1 );
		VAL_MAC_CONJ( _y4, bdiag[29], _x1 );
		VAL_MAC_CONJ( _y5, bdiag[36], _x1 );
		VAL_MAC_CONJ( _y6, bdiag[43], _x1 );
		VAL_MAC_CONJ( _y0, bdiag[2], _x2 );
		VAL_MAC_CONJ( _y1, bdiag[9], _x2 );
		VAL_MAC_CONJ( _y2, bdiag[16], _x2 );
		VAL_MAC_CONJ( _y3, bdiag[23], _x2 );
		VAL_MAC_CONJ( _y4, bdiag[30], _x2 );
		VAL_MAC_CONJ( _y5, bdiag[37], _x2 );
		VAL_MAC_CONJ( _y6, bdiag[44], _x2 );
		VAL_MAC_CONJ( _y0, bdiag[3], _x3 );
		VAL_MAC_CONJ( _y1, bdiag[10], _x3 );
		VAL_MAC_CONJ( _y2, bdiag[17], _x3 );
		VAL_MAC_CONJ( _y3, bdiag[24], _x3 );
		VAL_MAC_CONJ( _y4, bdiag[31], _x3 );
		VAL_MAC_CONJ( _y5, bdiag[38], _x3 );
		VAL_MAC_CONJ( _y6, bdiag[45], _x3 );
		VAL_MAC_CONJ( _y0, bdiag[4], _x4 );
		VAL_MAC_CONJ( _y1, bdiag[11], _x4 );
		VAL_MAC_CONJ( _y2, bdiag[18], _x4 );
		VAL_MAC_CONJ( _y3, bdiag[25], _x4 );
		VAL_MAC_CONJ( _y4, bdiag[32], _x4 );
		VAL_MAC_CONJ( _y5, bdiag[39], _x4 );
		VAL_MAC_CONJ( _y6, bdiag[46], _x4 );
		VAL_MAC_CONJ( _y0, bdiag[5], _x5 );
		VAL_MAC_CONJ( _y1, bdiag[12], _x5 );
		VAL_MAC_CONJ( _y2, bdiag[19], _x5 );
		VAL_MAC_CONJ( _y3, bdiag[26], _x5 );
		VAL_MAC_CONJ( _y4, bdiag[33], _x5 );
		VAL_MAC_CONJ( _y5, bdiag[40], _x5 );
		VAL_MAC_CONJ( _y6, bdiag[47], _x5 );
		VAL_MAC_CONJ( _y0, bdiag[6], _x6 );
		VAL_MAC_CONJ( _y1, bdiag[13], _x6 );
		VAL_MAC_CONJ( _y2, bdiag[20], _x6 );
		VAL_MAC_CONJ( _y3, bdiag[27], _x6 );
		VAL_MAC_CONJ( _y4, bdiag[34], _x6 );
		VAL_MAC_CONJ( _y5, bdiag[41], _x6 );
		VAL_MAC_CONJ( _y6, bdiag[48], _x6 );
		/* y += alpha * (diagonal-block product). */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1], alpha, _y1 );
		VAL_MAC( yp[2], alpha, _y2 );
		VAL_MAC( yp[3], alpha, _y3 );
		VAL_MAC( yp[4], alpha, _y4 );
		VAL_MAC( yp[5], alpha, _y5 );
		VAL_MAC( yp[6], alpha, _y6 );
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX(). */
#define MBCSR_HermMatConjMult_v1_aX_b1_xs1_ysX MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatConjMult_v1_aX_b1_xs1_ysX */
#define MBCSR_HermMatConjMult_v1_aX_b1_xs1_ysX MANGLE_MOD_(MBCSR_HermMatConjMult_v1_aX_b1_xs1_ysX_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$,
 *  specialized for unit-stride \p x and general-stride \p y.
 *
 *  The routine makes two passes over the M block rows. The first pass
 *  walks the stored off-diagonal 7x8 blocks; each block is used twice,
 *  once to update the 7 entries of y belonging to the current block
 *  row and once (transposed) to update the 8 entries of y starting at
 *  the block's column index. The second pass applies the 7x7 diagonal
 *  blocks.
 *
 *  \param[in] M      Number of block rows; both passes run I = 0..M-1.
 *  \param[in] d0     Logical offset of block row 0 within \p x and
 *                    \p y (scaled by \p incy when applied to \p y).
 *  \param[in] bptr   bptr[I+1] - bptr[I] off-diagonal blocks are
 *                    consumed for block row I. The counter K is not used
 *                    as an index; \p bind and \p bval are simply advanced
 *                    by one block per iteration.
 *  \param[in] bind   One entry per off-diagonal block: the logical index
 *                    of the block's leftmost column.
 *  \param[in] bval   Off-diagonal block values, 7*8 per block; entry
 *                    (r, c) of a block is read from bval[r*8 + c].
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row; entry
 *                    (r, c) is read from bdiag[r*7 + c].
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x      Input vector, read only, unit stride.
 *  \param[in,out] y  Output vector, accumulated into in place; logical
 *                    element i is accessed as y[i*incy].
 *  \param[in] incy   Distance between consecutive logical elements
 *                    of \p y.
 *
 *  \note VAL_MAC_CONJ is defined outside this file; from the formula
 *  above it presumably accumulates conj(a)*b -- confirm against the
 *  macro definition.
 */
void
MBCSR_HermMatConjMult_v1_aX_b1_xs1_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y, oski_index_t incy )
{
	oski_value_t* yp = y + d0*incy;
	const oski_value_t* xp = x + d0;
	oski_index_t I;

	/* Pass 1: off-diagonal blocks, one block row (7 rows) at a time. */
	for( I = 0; I < M; I++, yp += 7*incy, xp += 7 )
	{
		/* _y0.._y6: unscaled accumulators for this block row of y. */
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		/* _x0.._x6: this block row's entries of x, pre-scaled by alpha. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		oski_index_t K;

		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		VAL_MUL( _x0, alpha, xp[0] );
		VAL_MUL( _x1, alpha, xp[1] );
		VAL_MUL( _x2, alpha, xp[2] );
		VAL_MUL( _x3, alpha, xp[3] );
		VAL_MUL( _x4, alpha, xp[4] );
		VAL_MUL( _x5, alpha, xp[5] );
		VAL_MUL( _x6, alpha, xp[6] );

		/* Each iteration consumes one column index and one 7x8 block. */
		for( K = bptr[I]; K < bptr[I+1]; K++, bind++, bval += 7*8 )
		{
			oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* xpp = x + j0;
			oski_value_t* ypp = y + j0*incy;

			/* _xp0.._xp7: the 8 entries of x under the block's columns. */
			REGISTER oski_value_t _xp0;
			REGISTER oski_value_t _xp1;
			REGISTER oski_value_t _xp2;
			REGISTER oski_value_t _xp3;
			REGISTER oski_value_t _xp4;
			REGISTER oski_value_t _xp5;
			REGISTER oski_value_t _xp6;
			REGISTER oski_value_t _xp7;
			/* _yp0.._yp7: transposed-block contribution to y at j0..j0+7. */
			REGISTER oski_value_t _yp0;
			REGISTER oski_value_t _yp1;
			REGISTER oski_value_t _yp2;
			REGISTER oski_value_t _yp3;
			REGISTER oski_value_t _yp4;
			REGISTER oski_value_t _yp5;
			REGISTER oski_value_t _yp6;
			REGISTER oski_value_t _yp7;
			VAL_SET_ZERO( _yp0 );
			VAL_SET_ZERO( _yp1 );
			VAL_SET_ZERO( _yp2 );
			VAL_SET_ZERO( _yp3 );
			VAL_SET_ZERO( _yp4 );
			VAL_SET_ZERO( _yp5 );
			VAL_SET_ZERO( _yp6 );
			VAL_SET_ZERO( _yp7 );
			VAL_ASSIGN( _xp0, xpp[0] );
			VAL_ASSIGN( _xp1, xpp[1] );
			VAL_ASSIGN( _xp2, xpp[2] );
			VAL_ASSIGN( _xp3, xpp[3] );
			VAL_ASSIGN( _xp4, xpp[4] );
			VAL_ASSIGN( _xp5, xpp[5] );
			VAL_ASSIGN( _xp6, xpp[6] );
			VAL_ASSIGN( _xp7, xpp[7] );
			/* Transposed use, no conjugation: _yp[c] += bval[r*8+c] * _x[r]
			 * (alpha is already folded into _x). */
			VAL_MAC( _yp0, bval[0], _x0 );
			VAL_MAC( _yp1, bval[1], _x0 );
			VAL_MAC( _yp2, bval[2], _x0 );
			VAL_MAC( _yp3, bval[3], _x0 );
			VAL_MAC( _yp4, bval[4], _x0 );
			VAL_MAC( _yp5, bval[5], _x0 );
			VAL_MAC( _yp6, bval[6], _x0 );
			VAL_MAC( _yp7, bval[7], _x0 );
			VAL_MAC( _yp0, bval[8], _x1 );
			VAL_MAC( _yp1, bval[9], _x1 );
			VAL_MAC( _yp2, bval[10], _x1 );
			VAL_MAC( _yp3, bval[11], _x1 );
			VAL_MAC( _yp4, bval[12], _x1 );
			VAL_MAC( _yp5, bval[13], _x1 );
			VAL_MAC( _yp6, bval[14], _x1 );
			VAL_MAC( _yp7, bval[15], _x1 );
			VAL_MAC( _yp0, bval[16], _x2 );
			VAL_MAC( _yp1, bval[17], _x2 );
			VAL_MAC( _yp2, bval[18], _x2 );
			VAL_MAC( _yp3, bval[19], _x2 );
			VAL_MAC( _yp4, bval[20], _x2 );
			VAL_MAC( _yp5, bval[21], _x2 );
			VAL_MAC( _yp6, bval[22], _x2 );
			VAL_MAC( _yp7, bval[23], _x2 );
			VAL_MAC( _yp0, bval[24], _x3 );
			VAL_MAC( _yp1, bval[25], _x3 );
			VAL_MAC( _yp2, bval[26], _x3 );
			VAL_MAC( _yp3, bval[27], _x3 );
			VAL_MAC( _yp4, bval[28], _x3 );
			VAL_MAC( _yp5, bval[29], _x3 );
			VAL_MAC( _yp6, bval[30], _x3 );
			VAL_MAC( _yp7, bval[31], _x3 );
			VAL_MAC( _yp0, bval[32], _x4 );
			VAL_MAC( _yp1, bval[33], _x4 );
			VAL_MAC( _yp2, bval[34], _x4 );
			VAL_MAC( _yp3, bval[35], _x4 );
			VAL_MAC( _yp4, bval[36], _x4 );
			VAL_MAC( _yp5, bval[37], _x4 );
			VAL_MAC( _yp6, bval[38], _x4 );
			VAL_MAC( _yp7, bval[39], _x4 );
			VAL_MAC( _yp0, bval[40], _x5 );
			VAL_MAC( _yp1, bval[41], _x5 );
			VAL_MAC( _yp2, bval[42], _x5 );
			VAL_MAC( _yp3, bval[43], _x5 );
			VAL_MAC( _yp4, bval[44], _x5 );
			VAL_MAC( _yp5, bval[45], _x5 );
			VAL_MAC( _yp6, bval[46], _x5 );
			VAL_MAC( _yp7, bval[47], _x5 );
			VAL_MAC( _yp0, bval[48], _x6 );
			VAL_MAC( _yp1, bval[49], _x6 );
			VAL_MAC( _yp2, bval[50], _x6 );
			VAL_MAC( _yp3, bval[51], _x6 );
			VAL_MAC( _yp4, bval[52], _x6 );
			VAL_MAC( _yp5, bval[53], _x6 );
			VAL_MAC( _yp6, bval[54], _x6 );
			VAL_MAC( _yp7, bval[55], _x6 );
			/* Direct use via VAL_MAC_CONJ: _y[r] accumulates bval[r*8+c]
			 * against _xp[c]; alpha is applied after the block loop. */
			VAL_MAC_CONJ( _y0, bval[0], _xp0 );
			VAL_MAC_CONJ( _y1, bval[8], _xp0 );
			VAL_MAC_CONJ( _y2, bval[16], _xp0 );
			VAL_MAC_CONJ( _y3, bval[24], _xp0 );
			VAL_MAC_CONJ( _y4, bval[32], _xp0 );
			VAL_MAC_CONJ( _y5, bval[40], _xp0 );
			VAL_MAC_CONJ( _y6, bval[48], _xp0 );
			VAL_MAC_CONJ( _y0, bval[1], _xp1 );
			VAL_MAC_CONJ( _y1, bval[9], _xp1 );
			VAL_MAC_CONJ( _y2, bval[17], _xp1 );
			VAL_MAC_CONJ( _y3, bval[25], _xp1 );
			VAL_MAC_CONJ( _y4, bval[33], _xp1 );
			VAL_MAC_CONJ( _y5, bval[41], _xp1 );
			VAL_MAC_CONJ( _y6, bval[49], _xp1 );
			VAL_MAC_CONJ( _y0, bval[2], _xp2 );
			VAL_MAC_CONJ( _y1, bval[10], _xp2 );
			VAL_MAC_CONJ( _y2, bval[18], _xp2 );
			VAL_MAC_CONJ( _y3, bval[26], _xp2 );
			VAL_MAC_CONJ( _y4, bval[34], _xp2 );
			VAL_MAC_CONJ( _y5, bval[42], _xp2 );
			VAL_MAC_CONJ( _y6, bval[50], _xp2 );
			VAL_MAC_CONJ( _y0, bval[3], _xp3 );
			VAL_MAC_CONJ( _y1, bval[11], _xp3 );
			VAL_MAC_CONJ( _y2, bval[19], _xp3 );
			VAL_MAC_CONJ( _y3, bval[27], _xp3 );
			VAL_MAC_CONJ( _y4, bval[35], _xp3 );
			VAL_MAC_CONJ( _y5, bval[43], _xp3 );
			VAL_MAC_CONJ( _y6, bval[51], _xp3 );
			VAL_MAC_CONJ( _y0, bval[4], _xp4 );
			VAL_MAC_CONJ( _y1, bval[12], _xp4 );
			VAL_MAC_CONJ( _y2, bval[20], _xp4 );
			VAL_MAC_CONJ( _y3, bval[28], _xp4 );
			VAL_MAC_CONJ( _y4, bval[36], _xp4 );
			VAL_MAC_CONJ( _y5, bval[44], _xp4 );
			VAL_MAC_CONJ( _y6, bval[52], _xp4 );
			VAL_MAC_CONJ( _y0, bval[5], _xp5 );
			VAL_MAC_CONJ( _y1, bval[13], _xp5 );
			VAL_MAC_CONJ( _y2, bval[21], _xp5 );
			VAL_MAC_CONJ( _y3, bval[29], _xp5 );
			VAL_MAC_CONJ( _y4, bval[37], _xp5 );
			VAL_MAC_CONJ( _y5, bval[45], _xp5 );
			VAL_MAC_CONJ( _y6, bval[53], _xp5 );
			VAL_MAC_CONJ( _y0, bval[6], _xp6 );
			VAL_MAC_CONJ( _y1, bval[14], _xp6 );
			VAL_MAC_CONJ( _y2, bval[22], _xp6 );
			VAL_MAC_CONJ( _y3, bval[30], _xp6 );
			VAL_MAC_CONJ( _y4, bval[38], _xp6 );
			VAL_MAC_CONJ( _y5, bval[46], _xp6 );
			VAL_MAC_CONJ( _y6, bval[54], _xp6 );
			VAL_MAC_CONJ( _y0, bval[7], _xp7 );
			VAL_MAC_CONJ( _y1, bval[15], _xp7 );
			VAL_MAC_CONJ( _y2, bval[23], _xp7 );
			VAL_MAC_CONJ( _y3, bval[31], _xp7 );
			VAL_MAC_CONJ( _y4, bval[39], _xp7 );
			VAL_MAC_CONJ( _y5, bval[47], _xp7 );
			VAL_MAC_CONJ( _y6, bval[55], _xp7 );
			/* Scatter the transposed contribution into y at the block's
			 * columns, stepping by incy. */
			VAL_INC( ypp[0], _yp0 );
			VAL_INC( ypp[1*incy], _yp1 );
			VAL_INC( ypp[2*incy], _yp2 );
			VAL_INC( ypp[3*incy], _yp3 );
			VAL_INC( ypp[4*incy], _yp4 );
			VAL_INC( ypp[5*incy], _yp5 );
			VAL_INC( ypp[6*incy], _yp6 );
			VAL_INC( ypp[7*incy], _yp7 );
		}
		/* Fold this block row's accumulated sums into y, scaled by alpha. */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1*incy], alpha, _y1 );
		VAL_MAC( yp[2*incy], alpha, _y2 );
		VAL_MAC( yp[3*incy], alpha, _y3 );
		VAL_MAC( yp[4*incy], alpha, _y4 );
		VAL_MAC( yp[5*incy], alpha, _y5 );
		VAL_MAC( yp[6*incy], alpha, _y6 );
	}
	/* Pass 2: diagonal block multiply; rewind to block row 0 and consume
	 * one 7x7 block of bdiag per block row. */
	yp = y + d0*incy;
	xp = x + d0;
	for( I = 0; I < M; I++, bdiag += 7*7, yp += 7*incy, xp += 7 )
	{
		/* Here _x0.._x6 hold the unscaled entries of x; alpha is applied
		 * when the sums are added to y below. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		VAL_ASSIGN( _x0, xp[0] );
		VAL_ASSIGN( _x1, xp[1] );
		VAL_ASSIGN( _x2, xp[2] );
		VAL_ASSIGN( _x3, xp[3] );
		VAL_ASSIGN( _x4, xp[4] );
		VAL_ASSIGN( _x5, xp[5] );
		VAL_ASSIGN( _x6, xp[6] );
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _y[r] accumulates bdiag[r*7+c] against _x[c] via VAL_MAC_CONJ. */
		VAL_MAC_CONJ( _y0, bdiag[0], _x0 );
		VAL_MAC_CONJ( _y1, bdiag[7], _x0 );
		VAL_MAC_CONJ( _y2, bdiag[14], _x0 );
		VAL_MAC_CONJ( _y3, bdiag[21], _x0 );
		VAL_MAC_CONJ( _y4, bdiag[28], _x0 );
		VAL_MAC_CONJ( _y5, bdiag[35], _x0 );
		VAL_MAC_CONJ( _y6, bdiag[42], _x0 );
		VAL_MAC_CONJ( _y0, bdiag[1], _x1 );
		VAL_MAC_CONJ( _y1, bdiag[8], _x1 );
		VAL_MAC_CONJ( _y2, bdiag[15], _x1 );
		VAL_MAC_CONJ( _y3, bdiag[22], _x1 );
		VAL_MAC_CONJ( _y4, bdiag[29], _x1 );
		VAL_MAC_CONJ( _y5, bdiag[36], _x1 );
		VAL_MAC_CONJ( _y6, bdiag[43], _x1 );
		VAL_MAC_CONJ( _y0, bdiag[2], _x2 );
		VAL_MAC_CONJ( _y1, bdiag[9], _x2 );
		VAL_MAC_CONJ( _y2, bdiag[16], _x2 );
		VAL_MAC_CONJ( _y3, bdiag[23], _x2 );
		VAL_MAC_CONJ( _y4, bdiag[30], _x2 );
		VAL_MAC_CONJ( _y5, bdiag[37], _x2 );
		VAL_MAC_CONJ( _y6, bdiag[44], _x2 );
		VAL_MAC_CONJ( _y0, bdiag[3], _x3 );
		VAL_MAC_CONJ( _y1, bdiag[10], _x3 );
		VAL_MAC_CONJ( _y2, bdiag[17], _x3 );
		VAL_MAC_CONJ( _y3, bdiag[24], _x3 );
		VAL_MAC_CONJ( _y4, bdiag[31], _x3 );
		VAL_MAC_CONJ( _y5, bdiag[38], _x3 );
		VAL_MAC_CONJ( _y6, bdiag[45], _x3 );
		VAL_MAC_CONJ( _y0, bdiag[4], _x4 );
		VAL_MAC_CONJ( _y1, bdiag[11], _x4 );
		VAL_MAC_CONJ( _y2, bdiag[18], _x4 );
		VAL_MAC_CONJ( _y3, bdiag[25], _x4 );
		VAL_MAC_CONJ( _y4, bdiag[32], _x4 );
		VAL_MAC_CONJ( _y5, bdiag[39], _x4 );
		VAL_MAC_CONJ( _y6, bdiag[46], _x4 );
		VAL_MAC_CONJ( _y0, bdiag[5], _x5 );
		VAL_MAC_CONJ( _y1, bdiag[12], _x5 );
		VAL_MAC_CONJ( _y2, bdiag[19], _x5 );
		VAL_MAC_CONJ( _y3, bdiag[26], _x5 );
		VAL_MAC_CONJ( _y4, bdiag[33], _x5 );
		VAL_MAC_CONJ( _y5, bdiag[40], _x5 );
		VAL_MAC_CONJ( _y6, bdiag[47], _x5 );
		VAL_MAC_CONJ( _y0, bdiag[6], _x6 );
		VAL_MAC_CONJ( _y1, bdiag[13], _x6 );
		VAL_MAC_CONJ( _y2, bdiag[20], _x6 );
		VAL_MAC_CONJ( _y3, bdiag[27], _x6 );
		VAL_MAC_CONJ( _y4, bdiag[34], _x6 );
		VAL_MAC_CONJ( _y5, bdiag[41], _x6 );
		VAL_MAC_CONJ( _y6, bdiag[48], _x6 );
		/* y += alpha * (diagonal-block product), stepping by incy. */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1*incy], alpha, _y1 );
		VAL_MAC( yp[2*incy], alpha, _y2 );
		VAL_MAC( yp[3*incy], alpha, _y3 );
		VAL_MAC( yp[4*incy], alpha, _y4 );
		VAL_MAC( yp[5*incy], alpha, _y5 );
		VAL_MAC( yp[6*incy], alpha, _y6 );
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1(). */
#define MBCSR_HermMatConjMult_v1_aX_b1_xsX_ys1 MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatConjMult_v1_aX_b1_xsX_ys1 */
#define MBCSR_HermMatConjMult_v1_aX_b1_xsX_ys1 MANGLE_MOD_(MBCSR_HermMatConjMult_v1_aX_b1_xsX_ys1_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$,
 *  specialized for general-stride \p x and unit-stride \p y.
 *
 *  The routine makes two passes over the M block rows. The first pass
 *  walks the stored off-diagonal 7x8 blocks; each block is used twice,
 *  once to update the 7 entries of y belonging to the current block
 *  row and once (transposed) to update the 8 entries of y starting at
 *  the block's column index. The second pass applies the 7x7 diagonal
 *  blocks.
 *
 *  \param[in] M      Number of block rows; both passes run I = 0..M-1.
 *  \param[in] d0     Logical offset of block row 0 within \p x and
 *                    \p y (scaled by \p incx when applied to \p x).
 *  \param[in] bptr   bptr[I+1] - bptr[I] off-diagonal blocks are
 *                    consumed for block row I. The counter K is not used
 *                    as an index; \p bind and \p bval are simply advanced
 *                    by one block per iteration.
 *  \param[in] bind   One entry per off-diagonal block: the logical index
 *                    of the block's leftmost column.
 *  \param[in] bval   Off-diagonal block values, 7*8 per block; entry
 *                    (r, c) of a block is read from bval[r*8 + c].
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row; entry
 *                    (r, c) is read from bdiag[r*7 + c].
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x      Input vector, read only; logical element i is
 *                    accessed as x[i*incx].
 *  \param[in] incx   Distance between consecutive logical elements
 *                    of \p x.
 *  \param[in,out] y  Output vector, accumulated into in place,
 *                    unit stride.
 *
 *  \note VAL_MAC_CONJ is defined outside this file; from the formula
 *  above it presumably accumulates conj(a)*b -- confirm against the
 *  macro definition.
 */
void
MBCSR_HermMatConjMult_v1_aX_b1_xsX_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y )
{
	oski_value_t* yp = y + d0;
	const oski_value_t* xp = x + d0*incx;
	oski_index_t I;

	/* Pass 1: off-diagonal blocks, one block row (7 rows) at a time. */
	for( I = 0; I < M; I++, yp += 7, xp += 7*incx )
	{
		/* _y0.._y6: unscaled accumulators for this block row of y. */
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		/* _x0.._x6: this block row's entries of x, pre-scaled by alpha. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		oski_index_t K;

		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		VAL_MUL( _x0, alpha, xp[0] );
		VAL_MUL( _x1, alpha, xp[1*incx] );
		VAL_MUL( _x2, alpha, xp[2*incx] );
		VAL_MUL( _x3, alpha, xp[3*incx] );
		VAL_MUL( _x4, alpha, xp[4*incx] );
		VAL_MUL( _x5, alpha, xp[5*incx] );
		VAL_MUL( _x6, alpha, xp[6*incx] );

		/* Each iteration consumes one column index and one 7x8 block. */
		for( K = bptr[I]; K < bptr[I+1]; K++, bind++, bval += 7*8 )
		{
			oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* xpp = x + j0*incx;
			oski_value_t* ypp = y + j0;

			/* _xp0.._xp7: the 8 entries of x under the block's columns. */
			REGISTER oski_value_t _xp0;
			REGISTER oski_value_t _xp1;
			REGISTER oski_value_t _xp2;
			REGISTER oski_value_t _xp3;
			REGISTER oski_value_t _xp4;
			REGISTER oski_value_t _xp5;
			REGISTER oski_value_t _xp6;
			REGISTER oski_value_t _xp7;
			/* _yp0.._yp7: transposed-block contribution to y at j0..j0+7. */
			REGISTER oski_value_t _yp0;
			REGISTER oski_value_t _yp1;
			REGISTER oski_value_t _yp2;
			REGISTER oski_value_t _yp3;
			REGISTER oski_value_t _yp4;
			REGISTER oski_value_t _yp5;
			REGISTER oski_value_t _yp6;
			REGISTER oski_value_t _yp7;
			VAL_SET_ZERO( _yp0 );
			VAL_SET_ZERO( _yp1 );
			VAL_SET_ZERO( _yp2 );
			VAL_SET_ZERO( _yp3 );
			VAL_SET_ZERO( _yp4 );
			VAL_SET_ZERO( _yp5 );
			VAL_SET_ZERO( _yp6 );
			VAL_SET_ZERO( _yp7 );
			VAL_ASSIGN( _xp0, xpp[0] );
			VAL_ASSIGN( _xp1, xpp[1*incx] );
			VAL_ASSIGN( _xp2, xpp[2*incx] );
			VAL_ASSIGN( _xp3, xpp[3*incx] );
			VAL_ASSIGN( _xp4, xpp[4*incx] );
			VAL_ASSIGN( _xp5, xpp[5*incx] );
			VAL_ASSIGN( _xp6, xpp[6*incx] );
			VAL_ASSIGN( _xp7, xpp[7*incx] );
			/* Transposed use, no conjugation: _yp[c] += bval[r*8+c] * _x[r]
			 * (alpha is already folded into _x). */
			VAL_MAC( _yp0, bval[0], _x0 );
			VAL_MAC( _yp1, bval[1], _x0 );
			VAL_MAC( _yp2, bval[2], _x0 );
			VAL_MAC( _yp3, bval[3], _x0 );
			VAL_MAC( _yp4, bval[4], _x0 );
			VAL_MAC( _yp5, bval[5], _x0 );
			VAL_MAC( _yp6, bval[6], _x0 );
			VAL_MAC( _yp7, bval[7], _x0 );
			VAL_MAC( _yp0, bval[8], _x1 );
			VAL_MAC( _yp1, bval[9], _x1 );
			VAL_MAC( _yp2, bval[10], _x1 );
			VAL_MAC( _yp3, bval[11], _x1 );
			VAL_MAC( _yp4, bval[12], _x1 );
			VAL_MAC( _yp5, bval[13], _x1 );
			VAL_MAC( _yp6, bval[14], _x1 );
			VAL_MAC( _yp7, bval[15], _x1 );
			VAL_MAC( _yp0, bval[16], _x2 );
			VAL_MAC( _yp1, bval[17], _x2 );
			VAL_MAC( _yp2, bval[18], _x2 );
			VAL_MAC( _yp3, bval[19], _x2 );
			VAL_MAC( _yp4, bval[20], _x2 );
			VAL_MAC( _yp5, bval[21], _x2 );
			VAL_MAC( _yp6, bval[22], _x2 );
			VAL_MAC( _yp7, bval[23], _x2 );
			VAL_MAC( _yp0, bval[24], _x3 );
			VAL_MAC( _yp1, bval[25], _x3 );
			VAL_MAC( _yp2, bval[26], _x3 );
			VAL_MAC( _yp3, bval[27], _x3 );
			VAL_MAC( _yp4, bval[28], _x3 );
			VAL_MAC( _yp5, bval[29], _x3 );
			VAL_MAC( _yp6, bval[30], _x3 );
			VAL_MAC( _yp7, bval[31], _x3 );
			VAL_MAC( _yp0, bval[32], _x4 );
			VAL_MAC( _yp1, bval[33], _x4 );
			VAL_MAC( _yp2, bval[34], _x4 );
			VAL_MAC( _yp3, bval[35], _x4 );
			VAL_MAC( _yp4, bval[36], _x4 );
			VAL_MAC( _yp5, bval[37], _x4 );
			VAL_MAC( _yp6, bval[38], _x4 );
			VAL_MAC( _yp7, bval[39], _x4 );
			VAL_MAC( _yp0, bval[40], _x5 );
			VAL_MAC( _yp1, bval[41], _x5 );
			VAL_MAC( _yp2, bval[42], _x5 );
			VAL_MAC( _yp3, bval[43], _x5 );
			VAL_MAC( _yp4, bval[44], _x5 );
			VAL_MAC( _yp5, bval[45], _x5 );
			VAL_MAC( _yp6, bval[46], _x5 );
			VAL_MAC( _yp7, bval[47], _x5 );
			VAL_MAC( _yp0, bval[48], _x6 );
			VAL_MAC( _yp1, bval[49], _x6 );
			VAL_MAC( _yp2, bval[50], _x6 );
			VAL_MAC( _yp3, bval[51], _x6 );
			VAL_MAC( _yp4, bval[52], _x6 );
			VAL_MAC( _yp5, bval[53], _x6 );
			VAL_MAC( _yp6, bval[54], _x6 );
			VAL_MAC( _yp7, bval[55], _x6 );
			/* Direct use via VAL_MAC_CONJ: _y[r] accumulates bval[r*8+c]
			 * against _xp[c]; alpha is applied after the block loop. */
			VAL_MAC_CONJ( _y0, bval[0], _xp0 );
			VAL_MAC_CONJ( _y1, bval[8], _xp0 );
			VAL_MAC_CONJ( _y2, bval[16], _xp0 );
			VAL_MAC_CONJ( _y3, bval[24], _xp0 );
			VAL_MAC_CONJ( _y4, bval[32], _xp0 );
			VAL_MAC_CONJ( _y5, bval[40], _xp0 );
			VAL_MAC_CONJ( _y6, bval[48], _xp0 );
			VAL_MAC_CONJ( _y0, bval[1], _xp1 );
			VAL_MAC_CONJ( _y1, bval[9], _xp1 );
			VAL_MAC_CONJ( _y2, bval[17], _xp1 );
			VAL_MAC_CONJ( _y3, bval[25], _xp1 );
			VAL_MAC_CONJ( _y4, bval[33], _xp1 );
			VAL_MAC_CONJ( _y5, bval[41], _xp1 );
			VAL_MAC_CONJ( _y6, bval[49], _xp1 );
			VAL_MAC_CONJ( _y0, bval[2], _xp2 );
			VAL_MAC_CONJ( _y1, bval[10], _xp2 );
			VAL_MAC_CONJ( _y2, bval[18], _xp2 );
			VAL_MAC_CONJ( _y3, bval[26], _xp2 );
			VAL_MAC_CONJ( _y4, bval[34], _xp2 );
			VAL_MAC_CONJ( _y5, bval[42], _xp2 );
			VAL_MAC_CONJ( _y6, bval[50], _xp2 );
			VAL_MAC_CONJ( _y0, bval[3], _xp3 );
			VAL_MAC_CONJ( _y1, bval[11], _xp3 );
			VAL_MAC_CONJ( _y2, bval[19], _xp3 );
			VAL_MAC_CONJ( _y3, bval[27], _xp3 );
			VAL_MAC_CONJ( _y4, bval[35], _xp3 );
			VAL_MAC_CONJ( _y5, bval[43], _xp3 );
			VAL_MAC_CONJ( _y6, bval[51], _xp3 );
			VAL_MAC_CONJ( _y0, bval[4], _xp4 );
			VAL_MAC_CONJ( _y1, bval[12], _xp4 );
			VAL_MAC_CONJ( _y2, bval[20], _xp4 );
			VAL_MAC_CONJ( _y3, bval[28], _xp4 );
			VAL_MAC_CONJ( _y4, bval[36], _xp4 );
			VAL_MAC_CONJ( _y5, bval[44], _xp4 );
			VAL_MAC_CONJ( _y6, bval[52], _xp4 );
			VAL_MAC_CONJ( _y0, bval[5], _xp5 );
			VAL_MAC_CONJ( _y1, bval[13], _xp5 );
			VAL_MAC_CONJ( _y2, bval[21], _xp5 );
			VAL_MAC_CONJ( _y3, bval[29], _xp5 );
			VAL_MAC_CONJ( _y4, bval[37], _xp5 );
			VAL_MAC_CONJ( _y5, bval[45], _xp5 );
			VAL_MAC_CONJ( _y6, bval[53], _xp5 );
			VAL_MAC_CONJ( _y0, bval[6], _xp6 );
			VAL_MAC_CONJ( _y1, bval[14], _xp6 );
			VAL_MAC_CONJ( _y2, bval[22], _xp6 );
			VAL_MAC_CONJ( _y3, bval[30], _xp6 );
			VAL_MAC_CONJ( _y4, bval[38], _xp6 );
			VAL_MAC_CONJ( _y5, bval[46], _xp6 );
			VAL_MAC_CONJ( _y6, bval[54], _xp6 );
			VAL_MAC_CONJ( _y0, bval[7], _xp7 );
			VAL_MAC_CONJ( _y1, bval[15], _xp7 );
			VAL_MAC_CONJ( _y2, bval[23], _xp7 );
			VAL_MAC_CONJ( _y3, bval[31], _xp7 );
			VAL_MAC_CONJ( _y4, bval[39], _xp7 );
			VAL_MAC_CONJ( _y5, bval[47], _xp7 );
			VAL_MAC_CONJ( _y6, bval[55], _xp7 );
			/* Scatter the transposed contribution into y at the block's
			 * columns. */
			VAL_INC( ypp[0], _yp0 );
			VAL_INC( ypp[1], _yp1 );
			VAL_INC( ypp[2], _yp2 );
			VAL_INC( ypp[3], _yp3 );
			VAL_INC( ypp[4], _yp4 );
			VAL_INC( ypp[5], _yp5 );
			VAL_INC( ypp[6], _yp6 );
			VAL_INC( ypp[7], _yp7 );
		}
		/* Fold this block row's accumulated sums into y, scaled by alpha. */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1], alpha, _y1 );
		VAL_MAC( yp[2], alpha, _y2 );
		VAL_MAC( yp[3], alpha, _y3 );
		VAL_MAC( yp[4], alpha, _y4 );
		VAL_MAC( yp[5], alpha, _y5 );
		VAL_MAC( yp[6], alpha, _y6 );
	}
	/* Pass 2: diagonal block multiply; rewind to block row 0 and consume
	 * one 7x7 block of bdiag per block row. */
	yp = y + d0;
	xp = x + d0*incx;
	for( I = 0; I < M; I++, bdiag += 7*7, yp += 7, xp += 7*incx )
	{
		/* Here _x0.._x6 hold the unscaled entries of x; alpha is applied
		 * when the sums are added to y below. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		VAL_ASSIGN( _x0, xp[0] );
		VAL_ASSIGN( _x1, xp[1*incx] );
		VAL_ASSIGN( _x2, xp[2*incx] );
		VAL_ASSIGN( _x3, xp[3*incx] );
		VAL_ASSIGN( _x4, xp[4*incx] );
		VAL_ASSIGN( _x5, xp[5*incx] );
		VAL_ASSIGN( _x6, xp[6*incx] );
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _y[r] accumulates bdiag[r*7+c] against _x[c] via VAL_MAC_CONJ. */
		VAL_MAC_CONJ( _y0, bdiag[0], _x0 );
		VAL_MAC_CONJ( _y1, bdiag[7], _x0 );
		VAL_MAC_CONJ( _y2, bdiag[14], _x0 );
		VAL_MAC_CONJ( _y3, bdiag[21], _x0 );
		VAL_MAC_CONJ( _y4, bdiag[28], _x0 );
		VAL_MAC_CONJ( _y5, bdiag[35], _x0 );
		VAL_MAC_CONJ( _y6, bdiag[42], _x0 );
		VAL_MAC_CONJ( _y0, bdiag[1], _x1 );
		VAL_MAC_CONJ( _y1, bdiag[8], _x1 );
		VAL_MAC_CONJ( _y2, bdiag[15], _x1 );
		VAL_MAC_CONJ( _y3, bdiag[22], _x1 );
		VAL_MAC_CONJ( _y4, bdiag[29], _x1 );
		VAL_MAC_CONJ( _y5, bdiag[36], _x1 );
		VAL_MAC_CONJ( _y6, bdiag[43], _x1 );
		VAL_MAC_CONJ( _y0, bdiag[2], _x2 );
		VAL_MAC_CONJ( _y1, bdiag[9], _x2 );
		VAL_MAC_CONJ( _y2, bdiag[16], _x2 );
		VAL_MAC_CONJ( _y3, bdiag[23], _x2 );
		VAL_MAC_CONJ( _y4, bdiag[30], _x2 );
		VAL_MAC_CONJ( _y5, bdiag[37], _x2 );
		VAL_MAC_CONJ( _y6, bdiag[44], _x2 );
		VAL_MAC_CONJ( _y0, bdiag[3], _x3 );
		VAL_MAC_CONJ( _y1, bdiag[10], _x3 );
		VAL_MAC_CONJ( _y2, bdiag[17], _x3 );
		VAL_MAC_CONJ( _y3, bdiag[24], _x3 );
		VAL_MAC_CONJ( _y4, bdiag[31], _x3 );
		VAL_MAC_CONJ( _y5, bdiag[38], _x3 );
		VAL_MAC_CONJ( _y6, bdiag[45], _x3 );
		VAL_MAC_CONJ( _y0, bdiag[4], _x4 );
		VAL_MAC_CONJ( _y1, bdiag[11], _x4 );
		VAL_MAC_CONJ( _y2, bdiag[18], _x4 );
		VAL_MAC_CONJ( _y3, bdiag[25], _x4 );
		VAL_MAC_CONJ( _y4, bdiag[32], _x4 );
		VAL_MAC_CONJ( _y5, bdiag[39], _x4 );
		VAL_MAC_CONJ( _y6, bdiag[46], _x4 );
		VAL_MAC_CONJ( _y0, bdiag[5], _x5 );
		VAL_MAC_CONJ( _y1, bdiag[12], _x5 );
		VAL_MAC_CONJ( _y2, bdiag[19], _x5 );
		VAL_MAC_CONJ( _y3, bdiag[26], _x5 );
		VAL_MAC_CONJ( _y4, bdiag[33], _x5 );
		VAL_MAC_CONJ( _y5, bdiag[40], _x5 );
		VAL_MAC_CONJ( _y6, bdiag[47], _x5 );
		VAL_MAC_CONJ( _y0, bdiag[6], _x6 );
		VAL_MAC_CONJ( _y1, bdiag[13], _x6 );
		VAL_MAC_CONJ( _y2, bdiag[20], _x6 );
		VAL_MAC_CONJ( _y3, bdiag[27], _x6 );
		VAL_MAC_CONJ( _y4, bdiag[34], _x6 );
		VAL_MAC_CONJ( _y5, bdiag[41], _x6 );
		VAL_MAC_CONJ( _y6, bdiag[48], _x6 );
		/* y += alpha * (diagonal-block product). */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1], alpha, _y1 );
		VAL_MAC( yp[2], alpha, _y2 );
		VAL_MAC( yp[3], alpha, _y3 );
		VAL_MAC( yp[4], alpha, _y4 );
		VAL_MAC( yp[5], alpha, _y5 );
		VAL_MAC( yp[6], alpha, _y6 );
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX(). */
#define MBCSR_HermMatConjMult_v1_aX_b1_xsX_ysX MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatConjMult_v1_aX_b1_xsX_ysX */
#define MBCSR_HermMatConjMult_v1_aX_b1_xsX_ysX MANGLE_MOD_(MBCSR_HermMatConjMult_v1_aX_b1_xsX_ysX_7x8)
#endif

/**
 *  \brief MBCSR implementation of
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$,
 *  for general strides in both x and y.
 *
 *  Fully unrolled kernel for 7x8 off-diagonal blocks and 7x7 diagonal
 *  blocks. It makes two passes over the M block rows: the first
 *  applies every stored off-diagonal block twice (once to the
 *  block-row part of y, once to the part of y starting at the block's
 *  column index), the second applies the diagonal blocks.
 *
 *  \param[in] M      Number of block rows to process.
 *  \param[in] d0     Element offset of the first block row; both x and
 *                    y are entered at logical index d0.
 *  \param[in] bptr   Block-row pointers; block row I owns the
 *                    off-diagonal blocks K with bptr[I] <= K < bptr[I+1].
 *  \param[in] bind   One entry per off-diagonal block, read as the
 *                    block's leftmost column index. Consumed
 *                    sequentially.
 *  \param[in] bval   Off-diagonal block values, 7*8 per block, indexed
 *                    here as bval[8*r + c]. Consumed sequentially.
 *  \param[in] bdiag  Diagonal block values, 7*7 per block row, indexed
 *                    here as bdiag[7*r + c].
 *  \param[in] alpha  Scalar multiplier.
 *  \param[in] x      Source vector, accessed with stride incx.
 *  \param[in] incx   Stride between logically consecutive x elements.
 *  \param[in,out] y  Destination vector, accumulated into with stride incy.
 *  \param[in] incy   Stride between logically consecutive y elements.
 *
 *  \note The VAL_* macros are defined elsewhere. The comments below
 *  assume VAL_MAC(a,b,c) performs a += b*c and VAL_MAC_CONJ(a,b,c)
 *  performs a += conj(b)*c -- TODO confirm against their definitions.
 */
void
MBCSR_HermMatConjMult_v1_aX_b1_xsX_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y, oski_index_t incy )
{
	/* Start of the block-row segments of y and x (logical index d0). */
	oski_value_t* yp = y + d0*incy;
	const oski_value_t* xp = x + d0*incx;
	oski_index_t I;

	/* Pass 1: off-diagonal blocks, one block row (7 rows) per iteration. */
	for( I = 0; I < M; I++, yp += 7*incy, xp += 7*incx )
	{
		/* _y0.._y6: accumulators for this block row's 7 entries of y. */
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		/* _x0.._x6: this block row's 7 entries of x, pre-scaled by alpha. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		oski_index_t K;

		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* alpha is folded into x here, so the _yp updates below need
		 * no further scaling before being added to y. */
		VAL_MUL( _x0, alpha, xp[0] );
		VAL_MUL( _x1, alpha, xp[1*incx] );
		VAL_MUL( _x2, alpha, xp[2*incx] );
		VAL_MUL( _x3, alpha, xp[3*incx] );
		VAL_MUL( _x4, alpha, xp[4*incx] );
		VAL_MUL( _x5, alpha, xp[5*incx] );
		VAL_MUL( _x6, alpha, xp[6*incx] );

		/* Walk this block row's off-diagonal blocks; bind and bval are
		 * advanced in step, one index and 7*8 values per block. */
		for( K = bptr[I]; K < bptr[I+1]; K++, bind++, bval += 7*8 )
		{
			oski_index_t j0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* xpp = x + j0*incx;
			oski_value_t* ypp = y + j0*incy;

			/* _xp0.._xp7: the 8 (unscaled) entries of x under this
			 * block's columns. */
			REGISTER oski_value_t _xp0;
			REGISTER oski_value_t _xp1;
			REGISTER oski_value_t _xp2;
			REGISTER oski_value_t _xp3;
			REGISTER oski_value_t _xp4;
			REGISTER oski_value_t _xp5;
			REGISTER oski_value_t _xp6;
			REGISTER oski_value_t _xp7;
			/* _yp0.._yp7: accumulators for the 8 entries of y starting
			 * at logical index j0. */
			REGISTER oski_value_t _yp0;
			REGISTER oski_value_t _yp1;
			REGISTER oski_value_t _yp2;
			REGISTER oski_value_t _yp3;
			REGISTER oski_value_t _yp4;
			REGISTER oski_value_t _yp5;
			REGISTER oski_value_t _yp6;
			REGISTER oski_value_t _yp7;
			VAL_SET_ZERO( _yp0 );
			VAL_SET_ZERO( _yp1 );
			VAL_SET_ZERO( _yp2 );
			VAL_SET_ZERO( _yp3 );
			VAL_SET_ZERO( _yp4 );
			VAL_SET_ZERO( _yp5 );
			VAL_SET_ZERO( _yp6 );
			VAL_SET_ZERO( _yp7 );
			VAL_ASSIGN( _xp0, xpp[0] );
			VAL_ASSIGN( _xp1, xpp[1*incx] );
			VAL_ASSIGN( _xp2, xpp[2*incx] );
			VAL_ASSIGN( _xp3, xpp[3*incx] );
			VAL_ASSIGN( _xp4, xpp[4*incx] );
			VAL_ASSIGN( _xp5, xpp[5*incx] );
			VAL_ASSIGN( _xp6, xpp[6*incx] );
			VAL_ASSIGN( _xp7, xpp[7*incx] );
			/* Transposed, unconjugated use of the block:
			 * _yp[c] += bval[8*r + c] * _x[r], r = 0..6, c = 0..7. */
			VAL_MAC( _yp0, bval[0], _x0 );
			VAL_MAC( _yp1, bval[1], _x0 );
			VAL_MAC( _yp2, bval[2], _x0 );
			VAL_MAC( _yp3, bval[3], _x0 );
			VAL_MAC( _yp4, bval[4], _x0 );
			VAL_MAC( _yp5, bval[5], _x0 );
			VAL_MAC( _yp6, bval[6], _x0 );
			VAL_MAC( _yp7, bval[7], _x0 );
			VAL_MAC( _yp0, bval[8], _x1 );
			VAL_MAC( _yp1, bval[9], _x1 );
			VAL_MAC( _yp2, bval[10], _x1 );
			VAL_MAC( _yp3, bval[11], _x1 );
			VAL_MAC( _yp4, bval[12], _x1 );
			VAL_MAC( _yp5, bval[13], _x1 );
			VAL_MAC( _yp6, bval[14], _x1 );
			VAL_MAC( _yp7, bval[15], _x1 );
			VAL_MAC( _yp0, bval[16], _x2 );
			VAL_MAC( _yp1, bval[17], _x2 );
			VAL_MAC( _yp2, bval[18], _x2 );
			VAL_MAC( _yp3, bval[19], _x2 );
			VAL_MAC( _yp4, bval[20], _x2 );
			VAL_MAC( _yp5, bval[21], _x2 );
			VAL_MAC( _yp6, bval[22], _x2 );
			VAL_MAC( _yp7, bval[23], _x2 );
			VAL_MAC( _yp0, bval[24], _x3 );
			VAL_MAC( _yp1, bval[25], _x3 );
			VAL_MAC( _yp2, bval[26], _x3 );
			VAL_MAC( _yp3, bval[27], _x3 );
			VAL_MAC( _yp4, bval[28], _x3 );
			VAL_MAC( _yp5, bval[29], _x3 );
			VAL_MAC( _yp6, bval[30], _x3 );
			VAL_MAC( _yp7, bval[31], _x3 );
			VAL_MAC( _yp0, bval[32], _x4 );
			VAL_MAC( _yp1, bval[33], _x4 );
			VAL_MAC( _yp2, bval[34], _x4 );
			VAL_MAC( _yp3, bval[35], _x4 );
			VAL_MAC( _yp4, bval[36], _x4 );
			VAL_MAC( _yp5, bval[37], _x4 );
			VAL_MAC( _yp6, bval[38], _x4 );
			VAL_MAC( _yp7, bval[39], _x4 );
			VAL_MAC( _yp0, bval[40], _x5 );
			VAL_MAC( _yp1, bval[41], _x5 );
			VAL_MAC( _yp2, bval[42], _x5 );
			VAL_MAC( _yp3, bval[43], _x5 );
			VAL_MAC( _yp4, bval[44], _x5 );
			VAL_MAC( _yp5, bval[45], _x5 );
			VAL_MAC( _yp6, bval[46], _x5 );
			VAL_MAC( _yp7, bval[47], _x5 );
			VAL_MAC( _yp0, bval[48], _x6 );
			VAL_MAC( _yp1, bval[49], _x6 );
			VAL_MAC( _yp2, bval[50], _x6 );
			VAL_MAC( _yp3, bval[51], _x6 );
			VAL_MAC( _yp4, bval[52], _x6 );
			VAL_MAC( _yp5, bval[53], _x6 );
			VAL_MAC( _yp6, bval[54], _x6 );
			VAL_MAC( _yp7, bval[55], _x6 );
			/* Conjugated use of the block as stored:
			 * _y[r] += conj(bval[8*r + c]) * _xp[c], c = 0..7, r = 0..6. */
			VAL_MAC_CONJ( _y0, bval[0], _xp0 );
			VAL_MAC_CONJ( _y1, bval[8], _xp0 );
			VAL_MAC_CONJ( _y2, bval[16], _xp0 );
			VAL_MAC_CONJ( _y3, bval[24], _xp0 );
			VAL_MAC_CONJ( _y4, bval[32], _xp0 );
			VAL_MAC_CONJ( _y5, bval[40], _xp0 );
			VAL_MAC_CONJ( _y6, bval[48], _xp0 );
			VAL_MAC_CONJ( _y0, bval[1], _xp1 );
			VAL_MAC_CONJ( _y1, bval[9], _xp1 );
			VAL_MAC_CONJ( _y2, bval[17], _xp1 );
			VAL_MAC_CONJ( _y3, bval[25], _xp1 );
			VAL_MAC_CONJ( _y4, bval[33], _xp1 );
			VAL_MAC_CONJ( _y5, bval[41], _xp1 );
			VAL_MAC_CONJ( _y6, bval[49], _xp1 );
			VAL_MAC_CONJ( _y0, bval[2], _xp2 );
			VAL_MAC_CONJ( _y1, bval[10], _xp2 );
			VAL_MAC_CONJ( _y2, bval[18], _xp2 );
			VAL_MAC_CONJ( _y3, bval[26], _xp2 );
			VAL_MAC_CONJ( _y4, bval[34], _xp2 );
			VAL_MAC_CONJ( _y5, bval[42], _xp2 );
			VAL_MAC_CONJ( _y6, bval[50], _xp2 );
			VAL_MAC_CONJ( _y0, bval[3], _xp3 );
			VAL_MAC_CONJ( _y1, bval[11], _xp3 );
			VAL_MAC_CONJ( _y2, bval[19], _xp3 );
			VAL_MAC_CONJ( _y3, bval[27], _xp3 );
			VAL_MAC_CONJ( _y4, bval[35], _xp3 );
			VAL_MAC_CONJ( _y5, bval[43], _xp3 );
			VAL_MAC_CONJ( _y6, bval[51], _xp3 );
			VAL_MAC_CONJ( _y0, bval[4], _xp4 );
			VAL_MAC_CONJ( _y1, bval[12], _xp4 );
			VAL_MAC_CONJ( _y2, bval[20], _xp4 );
			VAL_MAC_CONJ( _y3, bval[28], _xp4 );
			VAL_MAC_CONJ( _y4, bval[36], _xp4 );
			VAL_MAC_CONJ( _y5, bval[44], _xp4 );
			VAL_MAC_CONJ( _y6, bval[52], _xp4 );
			VAL_MAC_CONJ( _y0, bval[5], _xp5 );
			VAL_MAC_CONJ( _y1, bval[13], _xp5 );
			VAL_MAC_CONJ( _y2, bval[21], _xp5 );
			VAL_MAC_CONJ( _y3, bval[29], _xp5 );
			VAL_MAC_CONJ( _y4, bval[37], _xp5 );
			VAL_MAC_CONJ( _y5, bval[45], _xp5 );
			VAL_MAC_CONJ( _y6, bval[53], _xp5 );
			VAL_MAC_CONJ( _y0, bval[6], _xp6 );
			VAL_MAC_CONJ( _y1, bval[14], _xp6 );
			VAL_MAC_CONJ( _y2, bval[22], _xp6 );
			VAL_MAC_CONJ( _y3, bval[30], _xp6 );
			VAL_MAC_CONJ( _y4, bval[38], _xp6 );
			VAL_MAC_CONJ( _y5, bval[46], _xp6 );
			VAL_MAC_CONJ( _y6, bval[54], _xp6 );
			VAL_MAC_CONJ( _y0, bval[7], _xp7 );
			VAL_MAC_CONJ( _y1, bval[15], _xp7 );
			VAL_MAC_CONJ( _y2, bval[23], _xp7 );
			VAL_MAC_CONJ( _y3, bval[31], _xp7 );
			VAL_MAC_CONJ( _y4, bval[39], _xp7 );
			VAL_MAC_CONJ( _y5, bval[47], _xp7 );
			VAL_MAC_CONJ( _y6, bval[55], _xp7 );
			/* Flush the transposed contribution; it already carries
			 * alpha via the pre-scaled _x values. */
			VAL_INC( ypp[0], _yp0 );
			VAL_INC( ypp[1*incy], _yp1 );
			VAL_INC( ypp[2*incy], _yp2 );
			VAL_INC( ypp[3*incy], _yp3 );
			VAL_INC( ypp[4*incy], _yp4 );
			VAL_INC( ypp[5*incy], _yp5 );
			VAL_INC( ypp[6*incy], _yp6 );
			VAL_INC( ypp[7*incy], _yp7 );
		}
		/* The block-row accumulators were built from unscaled x, so
		 * alpha is applied here as they are added into y. */
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1*incy], alpha, _y1 );
		VAL_MAC( yp[2*incy], alpha, _y2 );
		VAL_MAC( yp[3*incy], alpha, _y3 );
		VAL_MAC( yp[4*incy], alpha, _y4 );
		VAL_MAC( yp[5*incy], alpha, _y5 );
		VAL_MAC( yp[6*incy], alpha, _y6 );
	}
	/* Diagonal block multiply */
	/* Pass 2: rewind to the first block row and apply one 7x7 diagonal
	 * block per block row, advancing bdiag by 7*7 values each time. */
	yp = y + d0*incy;
	xp = x + d0*incx;
	for( I = 0; I < M; I++, bdiag += 7*7, yp += 7*incy, xp += 7*incx )
	{
		/* Unlike pass 1, x is loaded unscaled here; alpha is applied
		 * once when the result is added into y. */
		REGISTER oski_value_t _x0;
		REGISTER oski_value_t _x1;
		REGISTER oski_value_t _x2;
		REGISTER oski_value_t _x3;
		REGISTER oski_value_t _x4;
		REGISTER oski_value_t _x5;
		REGISTER oski_value_t _x6;
		REGISTER oski_value_t _y0;
		REGISTER oski_value_t _y1;
		REGISTER oski_value_t _y2;
		REGISTER oski_value_t _y3;
		REGISTER oski_value_t _y4;
		REGISTER oski_value_t _y5;
		REGISTER oski_value_t _y6;
		VAL_ASSIGN( _x0, xp[0] );
		VAL_ASSIGN( _x1, xp[1*incx] );
		VAL_ASSIGN( _x2, xp[2*incx] );
		VAL_ASSIGN( _x3, xp[3*incx] );
		VAL_ASSIGN( _x4, xp[4*incx] );
		VAL_ASSIGN( _x5, xp[5*incx] );
		VAL_ASSIGN( _x6, xp[6*incx] );
		VAL_SET_ZERO( _y0 );
		VAL_SET_ZERO( _y1 );
		VAL_SET_ZERO( _y2 );
		VAL_SET_ZERO( _y3 );
		VAL_SET_ZERO( _y4 );
		VAL_SET_ZERO( _y5 );
		VAL_SET_ZERO( _y6 );
		/* _y[r] += conj(bdiag[7*r + c]) * _x[c], c = 0..6, r = 0..6. */
		VAL_MAC_CONJ( _y0, bdiag[0], _x0 );
		VAL_MAC_CONJ( _y1, bdiag[7], _x0 );
		VAL_MAC_CONJ( _y2, bdiag[14], _x0 );
		VAL_MAC_CONJ( _y3, bdiag[21], _x0 );
		VAL_MAC_CONJ( _y4, bdiag[28], _x0 );
		VAL_MAC_CONJ( _y5, bdiag[35], _x0 );
		VAL_MAC_CONJ( _y6, bdiag[42], _x0 );
		VAL_MAC_CONJ( _y0, bdiag[1], _x1 );
		VAL_MAC_CONJ( _y1, bdiag[8], _x1 );
		VAL_MAC_CONJ( _y2, bdiag[15], _x1 );
		VAL_MAC_CONJ( _y3, bdiag[22], _x1 );
		VAL_MAC_CONJ( _y4, bdiag[29], _x1 );
		VAL_MAC_CONJ( _y5, bdiag[36], _x1 );
		VAL_MAC_CONJ( _y6, bdiag[43], _x1 );
		VAL_MAC_CONJ( _y0, bdiag[2], _x2 );
		VAL_MAC_CONJ( _y1, bdiag[9], _x2 );
		VAL_MAC_CONJ( _y2, bdiag[16], _x2 );
		VAL_MAC_CONJ( _y3, bdiag[23], _x2 );
		VAL_MAC_CONJ( _y4, bdiag[30], _x2 );
		VAL_MAC_CONJ( _y5, bdiag[37], _x2 );
		VAL_MAC_CONJ( _y6, bdiag[44], _x2 );
		VAL_MAC_CONJ( _y0, bdiag[3], _x3 );
		VAL_MAC_CONJ( _y1, bdiag[10], _x3 );
		VAL_MAC_CONJ( _y2, bdiag[17], _x3 );
		VAL_MAC_CONJ( _y3, bdiag[24], _x3 );
		VAL_MAC_CONJ( _y4, bdiag[31], _x3 );
		VAL_MAC_CONJ( _y5, bdiag[38], _x3 );
		VAL_MAC_CONJ( _y6, bdiag[45], _x3 );
		VAL_MAC_CONJ( _y0, bdiag[4], _x4 );
		VAL_MAC_CONJ( _y1, bdiag[11], _x4 );
		VAL_MAC_CONJ( _y2, bdiag[18], _x4 );
		VAL_MAC_CONJ( _y3, bdiag[25], _x4 );
		VAL_MAC_CONJ( _y4, bdiag[32], _x4 );
		VAL_MAC_CONJ( _y5, bdiag[39], _x4 );
		VAL_MAC_CONJ( _y6, bdiag[46], _x4 );
		VAL_MAC_CONJ( _y0, bdiag[5], _x5 );
		VAL_MAC_CONJ( _y1, bdiag[12], _x5 );
		VAL_MAC_CONJ( _y2, bdiag[19], _x5 );
		VAL_MAC_CONJ( _y3, bdiag[26], _x5 );
		VAL_MAC_CONJ( _y4, bdiag[33], _x5 );
		VAL_MAC_CONJ( _y5, bdiag[40], _x5 );
		VAL_MAC_CONJ( _y6, bdiag[47], _x5 );
		VAL_MAC_CONJ( _y0, bdiag[6], _x6 );
		VAL_MAC_CONJ( _y1, bdiag[13], _x6 );
		VAL_MAC_CONJ( _y2, bdiag[20], _x6 );
		VAL_MAC_CONJ( _y3, bdiag[27], _x6 );
		VAL_MAC_CONJ( _y4, bdiag[34], _x6 );
		VAL_MAC_CONJ( _y5, bdiag[41], _x6 );
		VAL_MAC_CONJ( _y6, bdiag[48], _x6 );
		VAL_MAC( yp[0], alpha, _y0 );
		VAL_MAC( yp[1*incy], alpha, _y1 );
		VAL_MAC( yp[2*incy], alpha, _y2 );
		VAL_MAC( yp[3*incy], alpha, _y3 );
		VAL_MAC( yp[4*incy], alpha, _y4 );
		VAL_MAC( yp[5*incy], alpha, _y5 );
		VAL_MAC( yp[6*incy], alpha, _y6 );
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX
/**
 *  \brief Matrix times single-vector multiply in the conj case;
 *  see SymmMatMult_v1().
 */
#define HermMatConjMult_v1 SymmMatMult_v1

#else /* IS_VAL_COMPLEX */


/**
 *  \brief Hermitian matrix times single-vector multiply in the conj
 *  case: forwards to the kernel variant matching the strides.
 *
 *  A stride equal to 1 selects the unit-stride ("s1") form of the
 *  kernel for that vector; any other value selects the general-stride
 *  ("sX") form, which receives the stride as an extra argument.
 */
static void
HermMatConjMult_v1( oski_index_t M, oski_index_t d0,
	const oski_index_t* bptr, const oski_index_t* bind,
	const oski_value_t* bval, const oski_value_t* bdiag,
	oski_value_t alpha, const oski_value_t* x, oski_index_t incx,
	oski_value_t* y, oski_index_t incy )
{
	const int unit_x = ( incx == 1 );
	const int unit_y = ( incy == 1 );

	/* Both vectors contiguous. */
	if( unit_x && unit_y ) {
		MBCSR_HermMatConjMult_v1_aX_b1_xs1_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y );
		return;
	}

	/* Contiguous x, strided y. */
	if( unit_x ) {
		MBCSR_HermMatConjMult_v1_aX_b1_xs1_ysX( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y, incy );
		return;
	}

	/* Strided x, contiguous y. */
	if( unit_y ) {
		MBCSR_HermMatConjMult_v1_aX_b1_xsX_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, incx, y );
		return;
	}

	/* Both vectors strided. */
	MBCSR_HermMatConjMult_v1_aX_b1_xsX_ysX( M, d0,
		bptr, bind, bval, bdiag, alpha, x, incx, y, incy );
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX
/**
 *  \brief See SymmMatMult().
 */
#define HermMatConjMult SymmMatMult

#else /* IS_VAL_COMPLEX */



/**
 *  \brief Computes
 *  \f$y \leftarrow y + \alpha\cdot\mathrm{op}(A)\cdot x\f$,
 *  where \f$\mathrm{op}(A) = \bar{A}\f$, on the fully blocked
 *  portion of \f$A\f$.
 *
 *  Applies the single-vector routine HermMatConjMult_v1() to each
 *  column of the vector views in turn. The column count is taken
 *  from x_view. Always returns 0.
 */
static int
HermMatConjMult( const oski_submatMBCSR_t* A,
	oski_value_t alpha, const oski_vecview_t x_view,
	oski_vecview_t y_view )
{
	const oski_value_t* x_col; /* current column of X */
	oski_value_t* y_col; /* matching column of Y */
	oski_index_t col;

	/* This translation unit is specialized for 7x8 blocks. */
	assert( A->r == 7 );
	assert( A->c == 8 );

	x_col = x_view->val;
	y_col = y_view->val;

	for( col = 0; col < x_view->num_cols; col++ )
	{
		HermMatConjMult_v1( A->num_block_rows, A->offset,
			A->bptr, A->bind, A->bval, A->bdiag,
			alpha, x_col, x_view->rowinc, y_col, y_view->rowinc );

		/* Step both views to their next column. */
		x_col += x_view->colinc;
		y_col += y_view->colinc;
	}

	return 0;
}


#endif /* !IS_VAL_COMPLEX */


#if defined(DO_NAME_MANGLING)
/** Mangled name for primary exported symbol */
#define SymmSubmatReprMult MANGLE_MOD_(SymmSubmatReprMult_7x8)
#endif

/**
 *  \brief Computes
 *  \f$y \leftarrow y + \alpha\cdot\mathrm{op}(A)\cdot x\f$,
 *  where \f$A\f$ is stored in 7x8 MBCSR format and
 *  either \f$A = A^T\f$ or \f$A = \bar{A}^T\f$.
 *
 *  Set is_herm to a non-zero value if \f$A\f$ is Hermitian, or
 *  0 if it is symmetric.
 *
 *  The four values of opA collapse onto two kernels per case:
 *    - Hermitian: OP_NORMAL and OP_CONJ_TRANS use HermMatMult();
 *      OP_CONJ and OP_TRANS use HermMatConjMult().
 *    - Symmetric: OP_NORMAL and OP_TRANS use SymmMatMult();
 *      OP_CONJ and OP_CONJ_TRANS use SymmMatConjMult().
 *
 *  \returns The selected kernel's return value, or ERR_BAD_ARG
 *  (after reporting the error) if opA is not one of the four
 *  recognized operations.
 */
int
SymmSubmatReprMult( const oski_submatMBCSR_t* A, int is_herm,
	oski_matop_t opA,
	oski_value_t alpha, const oski_vecview_t x_view,
	oski_vecview_t y_view )
{
	if( is_herm ) {
		switch( opA )
		{
			case OP_NORMAL:
			case OP_CONJ_TRANS:
				return HermMatMult( A, alpha, x_view, y_view );
			case OP_CONJ:
			case OP_TRANS:
				return HermMatConjMult( A, alpha, x_view, y_view );
			default:
				break;
		}
	} else { /* symmetric, not Hermitian */
		switch( opA )
		{
			case OP_NORMAL:
			case OP_TRANS:
				return SymmMatMult( A, alpha, x_view, y_view );
			case OP_CONJ_TRANS:
			case OP_CONJ:
				return SymmMatConjMult( A, alpha, x_view, y_view );
			default:
				break;
		}
	}

	/* Only reached when opA matched none of the cases above. */
	OSKI_ERR_BAD_MATOP( SubmatReprMult, 3, opA );
	return ERR_BAD_ARG;
}


/* eof */
