/*
**
** PHiPAC Matrix-Matrix Code for the operation:
**    C = A*B + C
**
** Automatically Generated by mm_cgen ($Revision: 1.27 $) using the command:
**    ./mm_cgen -opA N -opB N -sp 2ma -prec single -l0 2 16 2 -rout mul_mfmf_mf_l0g -file mul_mfmf_mf_l0g.c -alpha 1 
**
** Run './mm_cgen -help' for help.
**
** Generated on: Thursday August 02 2012, 00:37:36 PDT
** Created by: Jeff Bilmes <bilmes@cs.berkeley.edu>
**             http://www.icsi.berkeley.edu/~bilmes/phipac
**
**
** Routine Usage: General (M,K,N) = (M, K, N) matrix multiply: Two stage software pipe [load, mul-add]
**    mul_mfmf_mf_l0g(const int M, const int K, const int N, const float *const A, const float *const B, float *const C, const int Astride, const int Bstride, const int Cstride)
** where
**  A is an MxK matrix
**  B is a KxN matrix
**  C is an MxN matrix
**  Astride is the number of entries between the start of each row of A
**  Bstride is the number of entries between the start of each row of B
**  Cstride is the number of entries between the start of each row of C
**
**
** "Copyright (c) 1995 The Regents of the University of California.  All
** rights reserved."  Permission to use, copy, modify, and distribute
** this software and its documentation for any purpose, without fee, and
** without written agreement is hereby granted, provided that the above
** copyright notice and the following two paragraphs appear in all copies
** of this software.
**
** IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
** DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
** OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF
** CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**
** THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
** INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
** AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
** ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO
** PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
**
*/

/*
 * General (M,K,N) = (M, K, N) matrix multiply: Two stage software pipe [load, mul-add]
 */
void
mul_mfmf_mf_l0g(const int M, const int K, const int N, const float *const A, const float *const B, float *const C, const int Astride, const int Bstride, const int Cstride)
{
   const float *a,*b;
   float *c;
   const float *ap_0,*ap_1;
   const float *bp;
   float *cp;
   const int A_sbs_stride = Astride*2;
   const int C_sbs_stride = Cstride*2;
   const int k_marg_el = ((K>=17)?((K-1)&15):K);
   const int k_norm = K - k_marg_el;
   const int m_marg_el = M & 1;
   const int m_norm = M - m_marg_el;
   const int n_marg_el = N & 1;
   const int n_norm = N - n_marg_el;
   float *const c_endp = C+m_norm*Cstride;
   register float c0_0,c0_1,c1_0,c1_1;
   register float t0_0,t0_1,t1_0,t1_1;
   for (c=C,a=A; c!= c_endp; c+=C_sbs_stride,a+=A_sbs_stride) {
      const float* const ap_endp = a + k_norm;
      float* const cp_endp = c + n_norm;
      const float* const apc_1 = a + Astride;
      for (b=B,cp=c; cp!=cp_endp; b+=2,cp+=2) {
         register float _b0,_b1;
         register float _a0,_a1;
         float *_cp;
         ap_0 = a;
         ap_1 = apc_1;
         bp=b;
         _cp=cp;c0_0=_cp[0];c0_1=_cp[1];
         _cp+=Cstride;c1_0=_cp[0];c1_1=_cp[1];
         if (K >= 17) {
            _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[0];_a1 = ap_1[0];
            bp+=Bstride;
            ap_0+=1;ap_1+=1;
            do {
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[0];_a1 = ap_1[0];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[1];_a1 = ap_1[1];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[2];_a1 = ap_1[2];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[3];_a1 = ap_1[3];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[4];_a1 = ap_1[4];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[5];_a1 = ap_1[5];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[6];_a1 = ap_1[6];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[7];_a1 = ap_1[7];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[8];_a1 = ap_1[8];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[9];_a1 = ap_1[9];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[10];_a1 = ap_1[10];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[11];_a1 = ap_1[11];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[12];_a1 = ap_1[12];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[13];_a1 = ap_1[13];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[14];_a1 = ap_1[14];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               t1_0=_a1*_b0;t1_1=_a1*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               c1_0+=t1_0;c1_1+=t1_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[15];_a1 = ap_1[15];
               bp+=Bstride;
               ap_0+=16;ap_1+=16;
            } while (ap_0 != ap_endp);
            t0_0=_a0*_b0;t0_1=_a0*_b1;
            t1_0=_a1*_b0;t1_1=_a1*_b1;
            c0_0+=t0_0;c0_1+=t0_1;
            c1_0+=t1_0;c1_1+=t1_1;
         }
         if (k_marg_el & 0x10) {
            /* Fixed M,K,N = 2,16,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[0];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[1];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[2];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[2];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[3];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[3];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[4];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[4];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[5];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[5];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[6];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[6];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[7];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[7];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[8];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[8];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[9];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[9];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[10];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[10];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[11];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[11];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[12];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[12];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[13];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[13];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[14];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[14];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[15];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[15];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 

            ap_0+=16;ap_1+=16;
         }
         if (k_marg_el & 0x8) {
            /* Fixed M,K,N = 2,8,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[0];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[1];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[2];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[2];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[3];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[3];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[4];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[4];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[5];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[5];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[6];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[6];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[7];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[7];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 

            ap_0+=8;ap_1+=8;
         }
         if (k_marg_el & 0x4) {
            /* Fixed M,K,N = 2,4,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[0];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[1];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[2];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[2];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[3];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[3];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 

            ap_0+=4;ap_1+=4;
         }
         if (k_marg_el & 0x2) {
            /* Fixed M,K,N = 2,2,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[0];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[1];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 

            ap_0+=2;ap_1+=2;
         }
         if (k_marg_el & 0x1) {
            /* Fixed M,K,N = 2,1,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            _a1 = ap_1[0];
            c1_0 += _a1*_b0; c1_1 += _a1*_b1; 

         }
         _cp=cp;_cp[0]=c0_0;_cp[1]=c0_1;
         _cp+=Cstride;_cp[0]=c1_0;_cp[1]=c1_1;
      }
   }
   for (c=C,a=A; c!= c_endp; c+=C_sbs_stride,a+=A_sbs_stride) {
      const float* const ap_endp = a + k_norm;
      const float* const apc_1 = a + Astride;
      b = B+n_norm;
      cp = c+n_norm;
      if (n_marg_el & 0x1) {
         register float _b0;
         register float _a0,_a1;
         float *_cp;
         ap_0 = a;
         ap_1 = apc_1;
         bp=b;
         _cp=cp;c0_0=_cp[0];
         _cp+=Cstride;c1_0=_cp[0];
         if (K >= 17) {
            _b0 = bp[0];_a0 = ap_0[0];_a1 = ap_1[0];
            bp+=Bstride;
            ap_0+=1;ap_1+=1;
            do {
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[0];_a1 = ap_1[0];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[1];_a1 = ap_1[1];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[2];_a1 = ap_1[2];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[3];_a1 = ap_1[3];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[4];_a1 = ap_1[4];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[5];_a1 = ap_1[5];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[6];_a1 = ap_1[6];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[7];_a1 = ap_1[7];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[8];_a1 = ap_1[8];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[9];_a1 = ap_1[9];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[10];_a1 = ap_1[10];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[11];_a1 = ap_1[11];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[12];_a1 = ap_1[12];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[13];_a1 = ap_1[13];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[14];_a1 = ap_1[14];
               bp+=Bstride;
               t0_0=_a0*_b0;
               t1_0=_a1*_b0;
               c0_0+=t0_0;
               c1_0+=t1_0;
               _b0 = bp[0];_a0 = ap_0[15];_a1 = ap_1[15];
               bp+=Bstride;
               ap_0+=16;ap_1+=16;
            } while (ap_0 != ap_endp);
            t0_0=_a0*_b0;
            t1_0=_a1*_b0;
            c0_0+=t0_0;
            c1_0+=t1_0;
         }
         if (k_marg_el & 0x10) {
            /* Fixed M,K,N = 2,16,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[0];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[1];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[2];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[2];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[3];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[3];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[4];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[4];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[5];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[5];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[6];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[6];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[7];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[7];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[8];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[8];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[9];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[9];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[10];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[10];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[11];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[11];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[12];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[12];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[13];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[13];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[14];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[14];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[15];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[15];
            c1_0 += _a1*_b0; 

            ap_0+=16;ap_1+=16;
         }
         if (k_marg_el & 0x8) {
            /* Fixed M,K,N = 2,8,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[0];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[1];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[2];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[2];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[3];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[3];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[4];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[4];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[5];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[5];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[6];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[6];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[7];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[7];
            c1_0 += _a1*_b0; 

            ap_0+=8;ap_1+=8;
         }
         if (k_marg_el & 0x4) {
            /* Fixed M,K,N = 2,4,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[0];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[1];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[2];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[2];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[3];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[3];
            c1_0 += _a1*_b0; 

            ap_0+=4;ap_1+=4;
         }
         if (k_marg_el & 0x2) {
            /* Fixed M,K,N = 2,2,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[0];
            c1_0 += _a1*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[1];
            c1_0 += _a1*_b0; 

            ap_0+=2;ap_1+=2;
         }
         if (k_marg_el & 0x1) {
            /* Fixed M,K,N = 2,1,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; 
            _a1 = ap_1[0];
            c1_0 += _a1*_b0; 

         }
         _cp=cp;_cp[0]=c0_0;
         _cp+=Cstride;_cp[0]=c1_0;
      }
   }
   if (m_marg_el & 0x1) {
      const float* const ap_endp = a + k_norm;
      float* const cp_endp = c + n_norm;
      for (b=B,cp=c; cp!=cp_endp; b+=2,cp+=2) {
         register float _b0,_b1;
         register float _a0;
         float *_cp;
         ap_0 = a;
         bp=b;
         _cp=cp;c0_0=_cp[0];c0_1=_cp[1];
         if (K >= 17) {
            _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[0];
            bp+=Bstride;
            ap_0+=1;
            do {
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[0];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[1];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[2];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[3];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[4];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[5];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[6];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[7];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[8];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[9];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[10];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[11];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[12];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[13];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[14];
               bp+=Bstride;
               t0_0=_a0*_b0;t0_1=_a0*_b1;
               c0_0+=t0_0;c0_1+=t0_1;
               _b0 = bp[0];_b1 = bp[1];_a0 = ap_0[15];
               bp+=Bstride;
               ap_0+=16;
            } while (ap_0 != ap_endp);
            t0_0=_a0*_b0;t0_1=_a0*_b1;
            c0_0+=t0_0;c0_1+=t0_1;
         }
         if (k_marg_el & 0x10) {
            /* Fixed M,K,N = 1,16,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[2];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[3];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[4];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[5];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[6];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[7];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[8];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[9];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[10];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[11];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[12];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[13];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[14];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[15];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 

            ap_0+=16;
         }
         if (k_marg_el & 0x8) {
            /* Fixed M,K,N = 1,8,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[2];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[3];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[4];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[5];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[6];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[7];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 

            ap_0+=8;
         }
         if (k_marg_el & 0x4) {
            /* Fixed M,K,N = 1,4,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[2];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[3];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 

            ap_0+=4;
         }
         if (k_marg_el & 0x2) {
            /* Fixed M,K,N = 1,2,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 

            ap_0+=2;
         }
         if (k_marg_el & 0x1) {
            /* Fixed M,K,N = 1,1,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; _b1 = bp[1]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; c0_1 += _a0*_b1; 

         }
         _cp=cp;_cp[0]=c0_0;_cp[1]=c0_1;
      }
      if (n_marg_el & 0x1) {
         register float _b0;
         register float _a0;
         float *_cp;
         ap_0 = a;
         bp=b;
         _cp=cp;c0_0=_cp[0];
         if (K >= 17) {
            _b0 = bp[0];_a0 = ap_0[0];
            bp+=Bstride;
            ap_0+=1;
            do {
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[0];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[1];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[2];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[3];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[4];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[5];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[6];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[7];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[8];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[9];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[10];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[11];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[12];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[13];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[14];
               bp+=Bstride;
               t0_0=_a0*_b0;
               c0_0+=t0_0;
               _b0 = bp[0];_a0 = ap_0[15];
               bp+=Bstride;
               ap_0+=16;
            } while (ap_0 != ap_endp);
            t0_0=_a0*_b0;
            c0_0+=t0_0;
         }
         if (k_marg_el & 0x10) {
            /* Fixed M,K,N = 1,16,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[2];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[3];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[4];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[5];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[6];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[7];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[8];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[9];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[10];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[11];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[12];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[13];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[14];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[15];
            c0_0 += _a0*_b0; 

            ap_0+=16;
         }
         if (k_marg_el & 0x8) {
            /* Fixed M,K,N = 1,8,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[2];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[3];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[4];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[5];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[6];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[7];
            c0_0 += _a0*_b0; 

            ap_0+=8;
         }
         if (k_marg_el & 0x4) {
            /* Fixed M,K,N = 1,4,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[2];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[3];
            c0_0 += _a0*_b0; 

            ap_0+=4;
         }
         if (k_marg_el & 0x2) {
            /* Fixed M,K,N = 1,2,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; 
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[1];
            c0_0 += _a0*_b0; 

            ap_0+=2;
         }
         if (k_marg_el & 0x1) {
            /* Fixed M,K,N = 1,1,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp[0]; 
            bp += Bstride;
            _a0 = ap_0[0];
            c0_0 += _a0*_b0; 

         }
         _cp=cp;_cp[0]=c0_0;
      }
   }
}
