!==============================================================================
!
! Routines:
!
! (1) intkernel()       Originally By MLT        Last Modified: 6/16/2011 (FHJ)
!
!     input: crys, kg, kp, epsi, xct, peinf types
!            hmtrx     effective Hamiltonian, with only the diagonal part
!            dcc, dvv  transformation matrices
!            kco       coordinates of k-points in the coarse grid
!            imap      map of k-points fine grid -> coarse grid;
!                      defines which point in the coarse grid is close to
!                      one in the fine grid
!     output: hmtrx    effective Hamiltonian, with diagonal part and
!                      interaction part (direct + exchange)
!
!     Build the interaction Kernel using the interpolation scheme
!     (see Rohlfing & Louie). The kernel on the coarse grid is read
!     from tape, "bsedmat" and "bsexmat", and used immediately
!     (using temporary files may speed up the calculation...)
!
!     interaction kernel K =  K_d + 2*K_x   <---- spin singlet
!                        K =  K_d           <---- spin triplet
!                        K =  2*K_x         <---- local fields
!
!     imatrix code: 1 dhead, 2 dwing, 3 dbody, 4 x, 5-10 dynamic screening
!
! (2) interpolate()     Originally By MLT        Last Modified: 7/1/2008 (JRD)
!
!     input: xct type
!            bse_co    kernel in the coarse grid
!            dcck      dcc transf. matrix at point k
!            dcckp     dcc transf. matrix at point k`
!            dvvk      dvv transf. matrix at point k
!            dvvkp     dvv transf. matrix at point k`
!     output: bse_fi   (interpolated) kernel in the fine grid
!
!==============================================================================

#include "f_defs.h"

module intkernel_m

  use global_m
  use blas_m
  use intpts_m
  use vcoul_generator_m
  implicit none
  
  public :: intkernel

contains

subroutine intkernel(crys,kg,kp,epsi,xct,hmtrx,dcc,dvv,kco,imap,flag,eqp,gvec)
  type (crystal), intent(in) :: crys
  type (grid), intent(in) :: kg
  type (kpoints), intent(in) :: kp
  type (epsinfo), intent(in) :: epsi
  type (xctinfo), intent(inout) :: xct
  SCALAR, intent(inout) :: hmtrx(:,:) ! (xct%nkpt*xct%ncband*xct%nvband*xct%nspin,peinf%nblocks*peinf%nblockd)
  SCALAR, intent(in) :: dcc(:,:,:,:), dvv(:,:,:,:) ! (xct%nkpt,xct%nvband,xct%nvb_co,xct%nspin)
  real(DP), intent(in) :: kco(:,:) ! (3,xct%nkpt_co)
  integer, intent(in) :: imap(:) ! (xct%nkpt)
  integer, intent(in) :: flag
  type (eqpinfo), intent(in) :: eqp
  type (gspace), intent(in) :: gvec

  type (twork_scell) :: work_scell
  real(DP) :: vcoul, oneoverq
  real(DP) :: vcoul0(1), closeweights(4)

! FHJ: these arrays depend only on the distance w.r.t qq=0 (inside the BZ)
  real(DP), allocatable :: dist_array(:) !length of qq for a given index
  real(DP), allocatable :: vcoul_array(:), oneoverq_array(:)

! FHJ: variables used for the cell structure/linked lists
  real(DP), allocatable :: cell_head(:), cell_list(:)
  real(DP) :: cell_dmin, cell_dmax, cell_factor
  integer :: cell_N
  integer :: jjcnt, isrtrq(1)
  integer :: closepts(4)
  
  integer, parameter :: ncell = 3
  
  integer :: iscreentemp,iparallel,ijk,icb,ivb,imatrix, &
    ik,ic,iv,ikp,icp,ivp,ikt,ikcvs,ikcvst,jj,jc,jv,js, &
    jcp,jvp,jsp,jk,jkp,dimbse,nmatrices,ifile, &
    i1,i2,i3,iold,icout,ivout,ncdim,nvdim,inew
  integer, allocatable :: indx(:)
  real(DP) :: diffl,difmin,diff(3),qlen,valuef, &
    valuet,fac_d,fac_x,hden,qq(3),fqmin(3),eps,kx,ky,kz, &
    tol,tsec(2),factor

!-------------------------------
! Dynamic screening

  real(DP) :: invwtilde,omegacpv,omegacvp,hdyn1,hdyn2

!------------------------------
! kernel and transformation matrices

  SCALAR, allocatable :: &
    bsedmatrix(:,:,:,:,:,:,:),bsedmt(:,:,:,:,:,:), &
    dcck(:,:,:),dcckp(:,:,:),dvvk(:,:,:),dvvkp(:,:,:)

  if(abs(xct%scaling) < TOL_Zero) return ! everything will be zero

  PUSH_SUB(intkernel)

  SAFE_ALLOCATE(indx, (xct%nkpt_co))

  call timacc(51,1,tsec)

  tol=TOL_Small
  factor = -8.d0*PI_D/(crys%celvol*xct%nktotal)
  if (flag.eq.0) then
    if(peinf%inode.eq.0) write(6,'(a)') 'Spin Triplet Kernel'
    fac_d = factor
    fac_x = 0d0
  elseif (flag.eq.1) then
    if(peinf%inode.eq.0) write(6,'(a)') 'Spin Singlet Kernel'
    fac_d = factor
    fac_x = factor
  elseif (flag.eq.2) then
    if(peinf%inode.eq.0) write(6,'(a)') 'Local Fields'
    fac_d = 0d0
    fac_x = factor
  else
    write(0,*) 'flag = ', flag
    call die("Illegal value of flag in intkernel.")
  endif
  
  call timacc(51,2,tsec)

!-------------------------------
! We should all contribute to calculating vcoul0 since it might involve minibz average

  if (peinf%inode .eq. 0) then
    write(6,701) xct%nkpt
  endif
  701 format(/,1x,'Calculating Vcoul with ',i6,' kpts')

  call timacc(55,1,tsec)
  
  SAFE_ALLOCATE(dist_array, (xct%nkpt))
  SAFE_ALLOCATE(vcoul_array, (xct%nkpt))
  SAFE_ALLOCATE(oneoverq_array, (xct%nkpt))
  iscreentemp=0
  iparallel=1

  inew = 0

  if(peinf%inode == 0) call checkgriduniformity(kp%kgrid, crys, xct%icutv)

#ifdef VERBOSE
  if (peinf%inode .eq. 0) then
    write(6,'(a)')
    write(6,1000) 'Point','kpt','qq  = kg%f(:,ik)-kg%f(:,1)','   |qq|^2   ','index'
    write(6,1000) '-----','---','--------------------------','------------','-----'
1000  format(2x, a5,1x, a5,1x, a26,1x, a12,1x, a5)
1001  format(2x, a5,1x, i5,1x, 3(f8.5,1x), f12.9,1x, i5)
    endif
#endif

  do ik = 1, xct%nkpt
    qq(:) = kg%f(:,ik) - kg%f(:,1)
    
! FHJ: Map these points into distances calculated in the BZ.
    difmin = INF
    do i1=-ncell,ncell+1
      diff(1) = qq(1) - i1
      do i2=-ncell,ncell+1
        diff(2) = qq(2) - i2
        do i3=-ncell,ncell+1
          diff(3) = qq(3) - i3
          diffl = DOT_PRODUCT(diff,MATMUL(crys%bdot,diff))
          if (diffl.lt.difmin) then
            difmin = diffl
            fqmin(:) = diff(:)
          endif
        enddo
      enddo
    enddo
    qq(:)=fqmin(:)

! FHJ: Take a look if the integral at this dist has already been calculated
    iold=0
    do jj=inew,1,-1
      if (abs(dist_array(jj)-difmin).lt.TOL_Small) then
        iold=jj
        exit
      endif
    enddo

    if (iold==0) then
      inew=inew+1
      dist_array(inew)=difmin
      vcoul0(1)=0.0d0
#ifdef VERBOSE
      if (peinf%inode .eq. 0) then
        write(6,1001) '~NEW~', ik, qq, difmin, inew
      endif
#endif
      isrtrq(1) = 1
      call vcoul_generator(xct%icutv,xct%truncval,gvec, &
        crys%bdot,xct%nktotal,1,isrtrq(:),iscreentemp,qq,xct%q0vec, &
        vcoul0(:),xct%iwritecoul,iparallel,xct%avgcut,oneoverq, &
        kp%kgrid,xct%epshead,work_scell,xct%averagew,xct%wcoul0)
     
      vcoul_array(inew)=vcoul0(1)
      oneoverq_array(inew)=oneoverq
    else
#ifdef VERBOSE
      if (peinf%inode .eq. 0) then
        write(6,1001) ' old ', ik, qq, difmin, iold
      endif
#endif
    endif !iold .== 0
  enddo !ik

  call destroy_qran()

#ifdef VERBOSE
  if (peinf%inode .eq. 0) then
    write(6,'(a)')
    write(6,*) 'Finished calculating Vcoul with ',inew,'unique points'
  endif
#endif

  cell_N=inew
  SAFE_ALLOCATE(cell_head, (cell_N))
  SAFE_ALLOCATE(cell_list, (inew))

! FHJ: Initialize and populate cells
  do jj=1,cell_N
    cell_head(jj) = 0
  enddo
  cell_dmin =  INF
  cell_dmax = -INF
  do jj=1,inew
    if (dist_array(jj).lt.cell_dmin) cell_dmin = dist_array(jj)
    if (dist_array(jj).gt.cell_dmax) cell_dmax = dist_array(jj)
  enddo
  cell_dmin = cell_dmin - TOL_Small
  cell_dmax = cell_dmax + TOL_Small
  cell_factor = cell_N/(cell_dmax - cell_dmin)

#ifdef VERBOSE
  if (peinf%inode .eq. 0) then
    write(6,'(a)')
    write(6,801) cell_dmin,cell_dmax,cell_factor
801 format(' Init cells, dmin= ',f8.5,' dmax= ',f8.5,' 1/size= ',f12.5)
  endif
#endif
  do jj=1,inew
    i1 = int((dist_array(jj)-cell_dmin)*cell_factor)+1
    if (i1.gt.cell_N .or. i1.lt.1) then
      if (peinf%inode .eq. 0) then
        write(0,'(a)')
        write(0,*) 'Invalid index for cell! jj=',jj,' index=',i1
      endif
      call die('Invalid index for cell', only_root_writes = .true.)
    endif
    cell_list(jj) = cell_head(i1)
    cell_head(i1) = jj
  enddo
#ifdef VERBOSE
  if (peinf%inode .eq. 0) then
    write(6,'(a)')
    write(6,'(a)') 'Cell Population Analysis'
    write(6,'(a)')
    write(6,900) ' cell ',' members '
    write(6,900) '------','---------'
900 format(2x, a6,1x, a9)
    do jj=1,cell_N
      write(6,'(2x,i6)',advance='no') jj
      i1=cell_head(jj)
      do while (i1.gt.0)
        write(6,'(1x,i5)',advance='no') i1
        i1=cell_list(i1)
      enddo
      write(6,'(a)')
    enddo
  endif
#endif

  if (xct%icutv .eq. 7) then
    SAFE_DEALLOCATE_P(work_scell%fftbox_1D)
  endif
  
  call timacc(55,2,tsec)
  call timacc(51,1,tsec)

!--------------------------------
! Allocate data

  if (peinf%inode .eq. 0) then
    if (xct%iskipinterp .eq. 0) then
      write(6,'(/,1x,a)') 'Performing Kernel Interpolation'
    else
      write(6,'(/,1x,a)') 'Building Interaction Kernel'
    endif
  endif

!  if(peinf%inode.eq.0) write(6,'(a)') 'Data allocation: intkernel'
  SAFE_ALLOCATE(bsedmatrix, (xct%nvb_co,xct%ncb_co,xct%nvb_co,xct%ncb_co,xct%nspin,xct%nspin,xct%nkpt_co))
  if (xct%ipar .eq. 1) then
    SAFE_ALLOCATE(bsedmt, (xct%nvband,xct%ncband,xct%nvband,xct%ncband,xct%nspin,xct%nspin))
  else if (xct%ipar .eq. 2) then
    SAFE_ALLOCATE(bsedmt, (xct%nvband,xct%ncband,xct%nvband,1,xct%nspin,xct%nspin))
  else
    SAFE_ALLOCATE(bsedmt, (xct%nvband,xct%ncband,1,1,xct%nspin,xct%nspin))
  endif
  SAFE_ALLOCATE(dcck, (xct%ncband,xct%ncb_co,xct%nspin))
  SAFE_ALLOCATE(dcckp, (xct%ncb_co,xct%ncband,xct%nspin))
  SAFE_ALLOCATE(dvvk, (xct%nvband,xct%nvb_co,xct%nspin))
  SAFE_ALLOCATE(dvvkp, (xct%nvb_co,xct%nvband,xct%nspin))
  
  dimbse=xct%nkpt_co*(xct%ncb_co*xct%nvb_co*xct%nspin)**2
  indx=0

#ifdef MPI
  call MPI_BARRIER(MPI_COMM_WORLD,mpierr)
#endif

  call timacc(51,2,tsec)
  call timacc(52,1,tsec)

  if (peinf%inode.eq.0) then

!----------------------------------------
! Check consistency of files
! indx(ikp): index of coarse-grid point ikp in files bsedmat,bsexmat

    call open_file(unit=11,file='bsedmat',form='unformatted',status='old')
    call open_file(unit=12,file='bsexmat',form='unformatted',status='old')
    if (xct%dynamic_screening) then
      call open_file(unit=13,file='bsedmat1',form='unformatted',status='old')
      call open_file(unit=14,file='bsedmat2',form='unformatted',status='old')
    endif
    i1=0
    read(11) jk,jc,jv,js
    if (xct%dynamic_screening) then
      read(13)
      read(14)
    endif

    if(jk /= xct%nkpt_co) then
      write(0,*) 'File has ', jk, '; we need ', xct%nkpt_co
      call die("bsedmat does not have the correct number of k-points in coarse grid")
    endif
    if(jc /= xct%ncb_co) then
      write(0,*) 'File has ', jc, '; we need ', xct%ncb_co
      call die("bsedmat does not have the correct number of conduction bands in coarse grid")
    endif
    if(jv /= xct%nvb_co) then
      write(0,*) 'File has ', jv, '; we need ', xct%nvb_co
      call die("bsedmat does not have the correct number of valence bands in coarse grid")
    endif
    if(js /= xct%nspin) then
      write(0,*) 'File has ', js, '; we need ', xct%nspin
      call die("bsedmat does not have the correct number of spins")
    endif
    
    read(12) jk,jc,jv,js

    if(jk /= xct%nkpt_co) then
      write(0,*) 'File has ', jk, '; we need ', xct%nkpt_co
      call die("bsexmat does not have the correct number of k-points in coarse grid")
    endif
    if(jc /= xct%ncb_co) then
      write(0,*) 'File has ', jc, '; we need ', xct%ncb_co
      call die("bsexmat does not have the correct number of conduction bands in coarse grid")
    endif
    if(jv /= xct%nvb_co) then
      write(0,*) 'File has ', jv, '; we need ', xct%nvb_co
      call die("bsexmat does not have the correct number of valence bands in coarse grid")
    endif
    if(js /= xct%nspin) then
      write(0,*) 'File has ', js, '; we need ', xct%nspin
      call die("bsexmat does not have the correct number of spins")
    endif
    
    do jj=1,xct%nkpt_co
      read(11) jk,kx,ky,kz
      if (xct%dynamic_screening) then
        read(13)
        read(14)
      endif
      do ik=xct%nkpt_co,1,-1
        if (abs(kx-kco(1,ik)).lt.tol.and.abs(ky-kco(2,ik)).lt. &
          tol.and.abs(kz-kco(3,ik)).lt.tol) then
          ikp=ik
          exit
        endif
      enddo
      indx(ikp)=jk
      if (jk.ne.ikp) i1=i1+1
      read(12) jk,kx,ky,kz
      if (abs(kx-kco(1,ikp)).gt.tol.or.abs(ky-kco(2,ikp)).gt. &
        tol.or.abs(kz-kco(3,ikp)).gt.tol) then
        write(0,*) jj, jk, ikp
        call die('kco do not match!')
      endif
    enddo

    if (i1.ne.0) then
      write(6,'(a)')
      write(6,'(a)') ' Files bsedmat/bsexmat have different map of points in coarse grid. Updating map. '
      write(6,'(a)')
      do jk=1,xct%nkpt_co
        write(6,'(2i4)') jk,indx(jk)
      enddo
    endif
    
  endif     ! peinf%inode = 0
#ifdef MPI
  call MPI_BCAST(indx, xct%nkpt_co, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
#endif

! FHJ: Prepare the cells for the interpolation of epsilon
  call alloc_intpts(epsi%nq,epsi%q(:,:),umklapp = .true.)
  call timacc(52,2,tsec)

  call logit('Done calculating Vcoul')

!------------- Read in coarse matrices: head, wings, body, exchange --------------------

! PE # 0 reads and broadcasts to everybody

  if (xct%dynamic_screening) then
    nmatrices = 10
  else
    nmatrices = 4
  endif

#ifdef VERBOSE
  if (peinf%inode.eq.0) write(6,'(a)') ' Read in coarse matrices: head, wings, body, exchange '
#endif
  do jk=1,xct%nkpt_co

#ifdef VERBOSE
    call logit('new jk')
    if (peinf%inode.eq.0) then
      write(6,*) ' ------- jk = ',jk,' ------- '
    endif
#endif

    do imatrix = 1, nmatrices     
      ! triplet kernel has no exchange term
      if(imatrix == 4 .and. flag == 0) cycle
      ! local-field kernel has no direct term
      if((imatrix >= 1 .and. imatrix <= 3) .and. flag == 2) cycle

      call timacc(53,1,tsec)

      bsedmatrix = 0.0
      if (peinf%inode.eq.0) then
        if (imatrix .lt. 4) then
          ifile=11
        else if (imatrix .eq. 4) then
          ifile=12
        else if (imatrix .lt. 8) then
          ifile=13
        else
          ifile=14
        endif
        
        do jj=1, xct%ncb_co * xct%nvb_co
          read(ifile) ik,jc,jv,(((((bsedmatrix(jv,jc,jvp,jcp,js,jsp,jkp), &
            jsp=1,xct%nspin),js=1,xct%nspin),jvp=1,xct%nvb_co), &
            jcp=1,xct%ncb_co),jkp=1,xct%nkpt_co)
        enddo
      endif
      
#ifdef MPI
      call MPI_BCAST(bsedmatrix(1,1,1,1,1,1,1), dimbse, MPI_SCALAR, 0, MPI_COMM_WORLD, mpierr)
#endif
      call timacc(53,2,tsec)

! For each kp point in the fine grid, perform interpolation in
! every matrix element related to that point.
        
      do ik = 1, xct%nkpt
        if (indx(imap(ik)).eq.jk) then
          do jkp = 1,xct%nkpt_co
            
            do ikt = 1, peinf%ibt(peinf%inode+1)
              ikp=peinf%ikb(peinf%inode+1,ikt)
              if (indx(imap(ikp)).eq.jkp) then
                bsedmt = 0.0
                qq(:) = kg%f(:,ik) - kg%f(:,ikp)
                
! Reduce to BZ (it doesn not matter if qq is on the boundary of the
! 1st BZ as long as epsi%q sample the 1st BZ and its surroundings

                difmin = INF
                do i1=-ncell+1,ncell
                  diff(1) = qq(1) - i1
                  do i2=-ncell+1,ncell
                    diff(2) = qq(2) - i2
                    do i3=-ncell+1,ncell
                      diff(3) = qq(3) - i3
                      diffl = DOT_PRODUCT(diff,MATMUL(crys%bdot,diff))
                      if (diffl.lt.difmin) then
                        difmin = diffl
                        fqmin(:) = diff(:)
                      endif
                    enddo
                  enddo
                enddo
                qq(:) = fqmin(:)

! FHJ: Find the corresponding integrals by comparing the current distance
!      with the ones saved in dist_array. Also, use the cell partitioning
!      scheme to speed things up
                iold=0 !index found? (valid if >0)
! FHJ: Because of rounding errors, the calculated distance might not be in
!      the right cell. So, we should also look one cell to the left/right.
!      jjcnt=1 -> no shift, jjcnt=2 -> left, jjcnt=3 0> right
jjcnt_do:       do jjcnt=1,3 !try finding the point 
                  jj = int((difmin-cell_dmin)*cell_factor)+1
                  if (jjcnt==2) then !move one cell to the left
                    jj = jj - 1
                    if (jj<1) jj = cell_N
                  else if (jjcnt==3) then !move one cell to the right
                    jj = jj + 1
                    if (jj>cell_N) jj = 1
                  endif
                  i1 = cell_head(jj)
                  do while(i1>0)
                    if (abs(dist_array(i1)-difmin).lt.TOL_Small) then
                      iold = i1
                      exit jjcnt_do
                    endif
                    i1 = cell_list(i1)
                  enddo
                enddo jjcnt_do

                if (iold==0) then
                  write(0,*) 'Found a distance that was not calculated before!'
                  write(0,*) 'dist=',difmin
                  write(0,*) 'cell=',int((difmin-cell_dmin)*cell_factor)+1
                  call die('List error')
                endif

                if (difmin.gt.epsi%emax) write(0,*) ' WARNING: emax too small, ', difmin, epsi%emax

! Interpolate epsinv - Just the head

! SIB:  if q=0 and we are truncating the coulomb interaction, then
!       the dielectric is 1 at q=0 for semiconductors.

                call timacc(54,1,tsec)

                if (xct%icutv.ne.0.and.xct%iscreen.eq.0.and. &
                  sum(abs(qq)).lt.TOL_Zero) then
                  if (peinf%inode.eq.0) write(6,'(a)') 'Coulomb truncation: epsinv(0,0) set to 1 for q=0'
                  eps = 1.0d0
                else
                  
                  call intpts_local(crys,qq,epsi%nq,epsi%q(:,:),xct,closepts(:),closeweights(:), umklapp = .true.)
                  eps=0D0
                  do ijk = 1, xct%idimensions + 1
                    eps = eps + closeweights(ijk)*epsi%eps(closepts(ijk)) 
                  enddo

! Use model eps istead?

!                  call epsmodel(crys,qq,eps)
  
                endif

                call timacc(54,2,tsec)


!------------------- Calculate Coulomb Interaction-------------------------------------
!
! SIB: Here we calculate the values of 1/q^2 and 1/q (no truncation case)
! for the fine q vector q=k-kp.  Things to note:
!
! (1) If q=0, then we want instead the averages of 1/q^2 and 1/q over
!     the "mini-BZ" (mini-BZ is the 1st BZ but scaled down by the
!     number of total q-points).  We do this by approximating the mini-BZ
!     by a sphere of radius rcell.  The 3/rcell^2 and 3/2rcell are just
!     then the appropriate averages over the spherical mini-BZ.
!
!JRD:
! (2) If we are using truncation, the wing is DEFINED as being
!     some smooth function multiplied by |q|*Vtrunc(G=0,q)*eps^-1(G=0,G`=0,q)
!     This is because the wings of the dielectric matrix are /propto |q| and pick up a factor
!     of eps^-1(G=0,G`=0,q) during the inversion process (see Epsilon/epsinv.f90).
!     We include this factor here and not above because eps^-1(G=0,G`=0,q) varies quicker in truncated case.
!     The Vtrunc(G=0,q) factor comes from the bare (truncated) coulomb interaction.

! We do it as if it were a semiconductor because we want q=0 here since the divergent
! parts were already removed in kernel

                qlen = sqrt(DOT_PRODUCT(qq,MATMUL(crys%bdot,qq)))
                
                vcoul=vcoul_array(iold)
                oneoverq=oneoverq_array(iold)
                
                vcoul=vcoul/(8.0*Pi_D)
                oneoverq=oneoverq/(8.0*Pi_D)
                
                call timacc(56,1,tsec)
                
                if (qlen .lt. TOL_Zero .and. xct%iscreen .eq. 0) then
                  hden = xct%wcoul0/(8.0*Pi_D*eps)
                else
                  hden = vcoul
                endif
                
                call timacc(56,2,tsec)

!---------------- Start interpolation for the present pair (k,kp)----------------------------

                do js=1,xct%nspin
                  dcck(:,:,js) = dcc(ik,:,:,js)
                  do icp=1,xct%ncband
                    dcckp(:,icp,js) = dcc(ikp,icp,:,js)
                  enddo
                  dvvk(:,:,js) = dvv(ik,:,:,js)
                  do ivp=1,xct%nvband
                    dvvkp(:,ivp,js) = dvv(ikp,ivp,:,js)
                  enddo
                enddo

!-----------------------------
! Head (spin diagonal)

                if (xct%ipar .eq. 1) then
                  ncdim=xct%ncband
                  nvdim=xct%nvband
                  icout=1
                  ivout=1
                else if (xct%ipar .eq. 2) then
                  ncdim=1
                  nvdim=xct%nvband
                  icout=peinf%icb(peinf%inode+1,ikt)
                  ivout=1
                else if (xct%ipar .eq. 3) then
                  ncdim=1
                  nvdim=1
                  icout=peinf%icb(peinf%inode+1,ikt)
                  ivout=peinf%ivb(peinf%inode+1,ikt)
                endif

                call timacc(57,1,tsec)

                if (xct%iskipinterp .eq. 0) then
                  call interpolate(xct,nvdim,ncdim,bsedmatrix(:,:,:,:,:,:,jkp),bsedmt,dcck,dcckp, &
                    dvvk,dvvkp,ivout,icout,imatrix,flag)
                else
                  do jsp=1,xct%nspin
                    do js=1,xct%nspin
                      do jcp=1,ncdim
                        do jvp=1,nvdim
                          do jc=1,xct%ncband
                            do jv=1,xct%nvband
                              bsedmt(jv,jc,jvp,jcp,js,jsp) = &
                                bsedmatrix(jv,jc,jvp+ivout-1,jcp+icout-1,js,jsp,jkp)
                            enddo
                          enddo
                        enddo
                      enddo
                    enddo
                  enddo
                endif

                call timacc(57,2,tsec)

!------------------------------
! Add interaction kernel to the Hamiltonian

                call timacc(58,1,tsec)

! JRD: valuef is the prefactor for the various head/wing/body/exchange and screening

                if (imatrix .eq. 1) then
                  if (xct%iscreen .eq. 0) then
                    valuef = fac_d * eps * hden
                  elseif (xct%iscreen .eq. 1) then
                    if (xct%icutv .eq. 0) then
                      valuef = fac_d * oneoverq
                    else
                      valuef = fac_d * eps * hden
                    endif
                  else
                    valuef = fac_d
                  endif
                else if (imatrix .eq. 2) then
                  if (xct%iscreen .eq. 0 .and. xct%icutv .eq. 0) then
                    valuef = fac_d * oneoverq
                  else
                    valuef = fac_d
                  endif
                else if (imatrix .eq. 3) then
                  valuef = fac_d
                else if (imatrix .eq. 4) then
                  valuef = -fac_x
                  if (xct%nspin .eq. 1) valuef = valuef * 2D0
                else
                  invwtilde = sqrt((1.0d0-eps)/(xct%wplasmon**2))
                  hdyn1 = (eps-1.0d0)*invwtilde
                  hdyn2 = hdyn1*invwtilde
                  if ( imatrix .eq. 5) then
                    valuef = fac_d*hden*0.5d0*hdyn1
                  else if ( imatrix .eq. 6 ) then
                    valuef = oneoverq * fac_d
                  else if ( imatrix .eq. 7) then
                    valuef = 0.5D0*fac_d
                  else if ( imatrix .eq. 8) then
                    valuef = fac_d*hden*0.5d0*hdyn2
                  else if ( imatrix .eq. 9) then
                    valuef = oneoverq*fac_d
                  else if ( imatrix .eq. 10) then
                    valuef = 0.5D0*fac_d
                  endif
                endif

                valuet=valuef
                
                if( .not. xct%dynamic_screening .and. abs(valuet) < TOL_Zero) cycle

                do icp = 1, xct%ncband
                  if (xct%ipar .eq. 1 .or. icp .eq. 1) then
                    do ivp = 1, xct%nvband
                      if (xct%ipar .lt. 3 .or. ivp .eq. 1) then
                        do jsp = 1, xct%nspin
                          if (xct%ipar .eq. 1) then 
                            ikcvst= jsp + (ivp - 1 + (icp - 1 + (ikt - 1)* &
                              xct%ncband)*xct%nvband)*xct%nspin
                            icb=icp
                            ivb=ivp
                          else if (xct%ipar .eq. 2) then
                            ikcvst= jsp + ((ivp - 1 + (ikt - 1)* &
                              xct%nvband))*xct%nspin
                            !icb=peinf%icb(peinf%inode+1,ikt)
                            icb=1
                            ivb=ivp
                          else
                            ikcvst= jsp + (((ikt - 1)))*xct%nspin
                            !icb=peinf%icb(peinf%inode+1,ikt)
                            !ivb=peinf%ivb(peinf%inode+1,ikt)
                            icb=1
                            ivb=1
                          endif
                          do ic = 1, xct%ncband
                            do iv = 1, xct%nvband
                              do js = 1,xct%nspin
                                ikcvs= js + (iv - 1 + (ic - 1 + (ik - 1)*xct%ncband)*xct%nvband)*xct%nspin
                                
                                if ( xct%dynamic_screening ) then
                                  omegacpv = xct%dynamic_energy - &
                                    eqp%ecqp(icp,ikp,jsp) + eqp%evqp(iv,ik,js)
                                  omegacvp = xct%dynamic_energy - &
                                    eqp%ecqp(ic,ik,js) + eqp%evqp(ivp,ikp,jsp)
                                  if (imatrix .eq. 5 .or. imatrix .eq. 7) then
                                    valuet=valuef*(omegacpv+omegacvp)        
                                  else if (imatrix .eq. 10 .or. imatrix .eq. 8) then
                                    valuet=valuef*(omegacpv**2+omegacvp**2)
                                  endif
                                endif
                                
                                hmtrx(ikcvs,ikcvst)=hmtrx(ikcvs,ikcvst) + &
                                  xct%scaling * valuet * bsedmt(iv,ic,ivb,icb,js,jsp)
                              enddo              ! js
                            enddo              ! iv
                          enddo              ! ic
                        enddo              ! jsp
                      endif
                    enddo              ! ivp
                  endif
                enddo              ! icp
                
                call timacc(58,2,tsec)
                
              endif
              
            enddo              ! ikt
          enddo                ! jkp
        endif
      enddo               ! ik
    enddo !imatrix
    
!------------------------
! Synchronization

!#ifdef MPI
!    call MPI_BARRIER(MPI_COMM_WORLD,mpierr)
!#endif
  enddo                !jk

  if (peinf%inode.eq.0) then
    call close_file(11)
    call close_file(12)
    if (xct%dynamic_screening) then
      call close_file(13)
      call close_file(14)
    endif
  endif
  
  SAFE_DEALLOCATE(cell_head)
  SAFE_DEALLOCATE(cell_list)
  SAFE_DEALLOCATE(bsedmatrix)
  SAFE_DEALLOCATE(bsedmt)
  SAFE_DEALLOCATE(dcck)
  SAFE_DEALLOCATE(dcckp)
  SAFE_DEALLOCATE(dvvk)
  SAFE_DEALLOCATE(dvvkp)
  SAFE_DEALLOCATE(dist_array)
  SAFE_DEALLOCATE(vcoul_array)
  SAFE_DEALLOCATE(oneoverq_array)
  SAFE_DEALLOCATE(indx)
  call dealloc_intpts()

  POP_SUB(intkernel)

  return
end subroutine intkernel

!=======================================================================================

subroutine interpolate(xct,nvdim,ncdim,bse_co,bse_fi,dcck,dcckp,dvvk,dvvkp, &
  ivin,icin,imatrix,flag)
  
  use global_m
  implicit none
  
  type (xctinfo), intent(inout) :: xct
     ! xct should not be changed but ifort 12.0.0 -O3 compiles
     ! incorrectly if it is intent(in)
  integer, intent(in) :: nvdim,ncdim
  SCALAR, intent(in) :: bse_co(:,:,:,:,:,:) !< (xct%nvb_co,xct%ncb_co,xct%nvb_co,xct%ncb_co,xct%nspin,xct%nspin)
  SCALAR, intent(out) :: bse_fi(:,:,:,:,:,:) !< (xct%nvband,xct%ncband,nvdim,ncdim,xct%nspin,xct%nspin)
  SCALAR, intent(in) :: dcck(:,:,:)  !< (xct%ncband,xct%ncb_co,xct%nspin)
  SCALAR, intent(in) :: dcckp(:,:,:) !< (xct%ncb_co,xct%ncband,xct%nspin)
  SCALAR, intent(in) :: dvvk(:,:,:)  !< (xct%nvband,xct%nvb_co,xct%nspin)
  SCALAR, intent(in) :: dvvkp(:,:,:) !< (xct%nvb_co,xct%nvband,xct%nspin)
  integer, intent(in) :: ivin, icin, imatrix, flag

  integer :: js,jsp,iv,ivp,jc,jcp,icb,ivb,jv,jvp,ic,icp, &
    js_dvvk, js_dvvkp, js_dcck, js_dcckp, bse_co_js, bse_co_jsp
  
  SCALAR, allocatable :: mat_vcvc(:,:,:,:),mat_vfvc(:,:,:,:), &
    mat_vfvf(:,:,:,:),mat_cccc(:,:,:,:), &
    mat_cfcc(:,:,:,:),mat_cfcf(:,:,:,:), &
    dummy(:,:,:,:),dummyp(:,:,:,:), &
    dummy2(:,:,:,:),dummy3(:,:,:,:), dvvkn(:,:)
  
  PUSH_SUB(interpolate)
  
  bse_fi=0.0
  
  do js=1,xct%nspin
    do jsp=1,xct%nspin
      
      if (xct%nspin .eq. 1) then
        js_dcck=1
        js_dcckp=1
        js_dvvk=1
        js_dvvkp=1
        bse_co_js=1
        bse_co_jsp=1
      elseif (flag .eq. 0 .and. imatrix .ne. 4 .and. js .eq. jsp ) then
        js_dcck=js
        js_dcckp=js
        bse_co_js=js
        if (js .eq. 1) then
          js_dvvk=js+1
          js_dvvkp=js+1   
          bse_co_jsp=js+1
        else             
          js_dvvk=js-1
          js_dvvkp=js-1                    
          bse_co_jsp=js-1
        end if
      else if((flag == 1 .and. imatrix /= 4 .and. js == jsp) .or. &
              (flag == 1 .and. imatrix == 4) .or. (flag == 2 .and. imatrix == 4)) then
        js_dcck=js
        js_dcckp=jsp
        js_dvvk=js
        js_dvvkp=jsp
        bse_co_js=js
        bse_co_jsp=jsp
      else
        cycle
      end if

! Reorder matrix

      if (xct%ipar .eq. 4) then

! Best on memory and fast if ncdim and nvdim are 1

        do icp=1,ncdim
          icb = icin
          do ivp=1,nvdim
            ivb = ivin
            do ic=1,xct%ncband
              do iv=1,xct%nvband
                do jcp=1,xct%ncb_co
                  do jvp=1,xct%nvb_co
                    do jc=1,xct%ncb_co
                      do jv=1,xct%nvb_co
                        bse_fi(iv,ic,ivp,icp,js,jsp) = bse_fi(iv,ic,ivp,icp,js,jsp) + &
                          bse_co(jv,jc,jvp,jcp,bse_co_js,bse_co_jsp) * dvvk(iv,jv,js_dvvk) &
                          * MYCONJG(dvvkp(jvp,ivb,js_dvvkp)) * &
                          MYCONJG(dcck(ic,jc,js_dcck)) * dcckp(jcp,icb,js_dcckp)
                      enddo
                    enddo
                  enddo
                enddo
              enddo
            enddo
          enddo
        enddo
        
      else if (xct%ipar .ge. 2) then

! Faster and better on memory when nvdim is not 1

        SAFE_ALLOCATE(dummy, (xct%nvb_co,xct%ncband,xct%nvb_co,ncdim))
        dummy = 0.0
              
        do icp=1,ncdim
          icb = icin
          do jcp=1,xct%ncb_co
            do jvp=1,xct%nvb_co
              call X(GEMM)('n','c',xct%nvb_co,xct%ncband,xct%ncb_co, &
                dcckp(jcp,icb,js_dcckp),bse_co(:,:,jvp,jcp,bse_co_js,bse_co_jsp), &
                xct%nvb_co,dcck(:,:,js_dcck),xct%ncband,ONE,dummy(:,:,jvp,icp),xct%nvb_co)
            enddo
          enddo
        enddo
        
        SAFE_ALLOCATE(dummyp, (xct%nvb_co,xct%nvb_co,xct%ncband,ncdim))
        SAFE_ALLOCATE(dummy2, (xct%nvb_co,xct%ncband,nvdim,ncdim))
        SAFE_ALLOCATE(dummy3, (xct%nvb_co,nvdim,xct%ncband,ncdim))
        SAFE_ALLOCATE(dvvkn, (xct%nvb_co,nvdim))
        dummy3 = 0.0
        dummy2 = 0.0
        
        do icp = 1, ncdim
          do ic = 1, xct%ncband
            do jvp = 1, xct%nvb_co
              dummyp(:,jvp,ic,icp) = dummy(:,ic,jvp,icp)
            enddo
          enddo
        enddo
        
        if (xct%ipar .eq. 2) then
          do ivp = 1,nvdim
            dvvkn(:,ivp) = MYCONJG(dvvkp(:,ivp,js_dvvkp))
          enddo
        else
          dvvkn(:,1) = MYCONJG(dvvkp(:,ivin,js_dvvkp))
        endif
        
        do icp=1,ncdim
          icb = icin
          do ic=1,xct%ncband
            call X(GEMM)('n','n',xct%nvb_co,nvdim,xct%nvb_co, &
              ONE,dummyp(1,1,ic,icp),xct%nvb_co,dvvkn(1,1),xct%nvb_co,ONE,dummy3(1,1,ic,icp),xct%nvb_co)
          enddo
        enddo
        
        do icp = 1, ncdim
          do ic = 1, xct%ncband
            dummy2(:,ic,:,icp) = dummy3(:,:,ic,icp)
          enddo
        enddo
        
        SAFE_DEALLOCATE(dummy)
        SAFE_DEALLOCATE(dummyp)
        SAFE_DEALLOCATE(dummy3)
        SAFE_DEALLOCATE(dvvkn)
        
        do icp=1,ncdim
          icb = icin
          do ivp=1,nvdim
            call X(GEMM)('n','n',xct%nvband,xct%ncband,xct%nvb_co,ONE,dvvk(:,:,js_dvvk),xct%nvband, &
              dummy2(:,:,ivp,icp),xct%nvb_co,ONE,bse_fi(:,:,ivp,icp,js,jsp),xct%nvband)
          enddo
        enddo
        
        SAFE_DEALLOCATE(dummy2)
              
      else if (xct%ipar .eq. 1) then

! Fastest but worst on memory

        SAFE_ALLOCATE(mat_vcvc, (xct%nvb_co,xct%nvb_co,xct%ncb_co,xct%ncb_co))

        do jcp=1,xct%ncb_co
          do jc=1,xct%ncb_co
            mat_vcvc(1:xct%nvb_co,1:xct%nvb_co,jc,jcp) = &
              bse_co(1:xct%nvb_co,jc,1:xct%nvb_co,jcp,bse_co_js,bse_co_jsp)
          enddo
        enddo

! Interpolate v,v`

        SAFE_ALLOCATE(mat_vfvc, (xct%nvband,xct%nvb_co,xct%ncb_co,xct%ncb_co))

        do jc=1,xct%ncb_co
          do jcp=1,xct%ncb_co
            mat_vfvc(1:xct%nvband,1:xct%nvb_co,jc,jcp) = &
              MATMUL((dvvk(1:xct%nvband,1:xct%nvb_co,js_dvvk)), &
              (mat_vcvc(1:xct%nvb_co,1:xct%nvb_co,jc,jcp)))
          enddo
        enddo

        SAFE_DEALLOCATE(mat_vcvc)
        SAFE_ALLOCATE(mat_vfvf, (xct%nvband,xct%nvband,xct%ncb_co,xct%ncb_co))

        do jc=1,xct%ncb_co
          do jcp=1,xct%ncb_co
            mat_vfvf(1:xct%nvband,1:xct%nvband,jc,jcp) = &
              MATMUL((mat_vfvc(1:xct%nvband,1:xct%nvb_co,jc,jcp)), &
              MYCONJG(dvvkp(1:xct%nvb_co,1:xct%nvband,js_dvvkp)))
          enddo
        enddo

! Reorder from v,v` to c,c`

        SAFE_DEALLOCATE(mat_vfvc)
        SAFE_ALLOCATE(mat_cccc, (xct%ncb_co,xct%ncb_co,xct%nvband,xct%nvband))

        do jcp=1,xct%ncb_co
          do jc=1,xct%ncb_co
            mat_cccc(jc,jcp,1:xct%nvband,1:xct%nvband) = &
              mat_vfvf(1:xct%nvband,1:xct%nvband,jc,jcp)
          enddo
        enddo

! Interpolate c,c`

        SAFE_DEALLOCATE(mat_vfvf)
        SAFE_ALLOCATE(mat_cfcc, (xct%ncband,xct%ncb_co,xct%nvband,xct%nvband))

        do iv=1,xct%nvband
          do ivp=1,xct%nvband
            mat_cfcc(1:xct%ncband,1:xct%ncb_co,iv,ivp) = &
              MATMUL(MYCONJG(dcck(1:xct%ncband,1:xct%ncb_co,js_dcck)), &
              (mat_cccc(1:xct%ncb_co,1:xct%ncb_co,iv,ivp)))
          enddo
        enddo

        SAFE_DEALLOCATE(mat_cccc)
        SAFE_ALLOCATE(mat_cfcf, (xct%ncband,xct%ncband,xct%nvband,xct%nvband))

        do iv=1,xct%nvband
          do ivp=1,xct%nvband
            mat_cfcf(1:xct%ncband,1:xct%ncband,iv,ivp) = &
              MATMUL((mat_cfcc(1:xct%ncband,1:xct%ncb_co,iv,ivp)), &
              (dcckp(1:xct%ncb_co,1:xct%ncband,js_dcckp)))
          enddo
        enddo

        SAFE_DEALLOCATE(mat_cfcc)

! Reorder matrix

        do ivp=1,xct%nvband
          do iv=1,xct%nvband
            bse_fi(iv,1:xct%ncband,ivp,1:xct%ncband,js,jsp) = &
              mat_cfcf(1:xct%ncband,1:xct%ncband,iv,ivp)
          enddo
        enddo

        SAFE_DEALLOCATE(mat_cfcf)

      endif
    enddo
  enddo

  POP_SUB(interpolate)

  return
end subroutine interpolate

end module intkernel_m
