!=================================================================================
!
! Routines:
!
! (1) mtxel_kernel()         Originally By MLT       Last Modified 7/9/2008 (JRD)
!
!     input: crys, gvec, syms, qg, wfnc, wfncp, wfnv, wfnvp,
!            xct types
!            ii  = label of ck block
!            ipe = label of the PE that contains the ckp block
!     output: bsedbody,bsedhead,bsedwing,bsex = kernel matrix elements
!              between ck (all v) and ckp (all vp) blocks
!
!     Calculate the head, wings, body, exchange of the kernel (see
!     eq. 34, 35, and 41-46, Rohlfing & Louie). The exchange has just
!     the proper part, no divergent contribution.
!
!=================================================================================

#include "f_defs.h"

subroutine mtxel_kernel(crys,gvec,syms,qg,wfnc,wfncp,wfnvp, &
  wfnv,xct,leading_dim,bsedbody,bsedhead,bsedwing,bsex,ii,ik,ikp, &
  ic_in,icp_in,iv_in,ivp_in,bsedbody1,bsedwing1,bsedbody2,bsedwing2, &
  vcoularray,fq,qq,g0,ifq,irq,q0len)

  use global_m
  use fftw_m
  use gmap_m
  use misc_m
  implicit none
  
  type (crystal), intent(in) :: crys
  type (gspace), intent(in) :: gvec
  type (symmetry), intent(in) :: syms
  type (grid), intent(in) :: qg
  type (wavefunction) :: wfnc,wfncp,wfnvp,wfnv
  type (xctinfo), intent(in) :: xct
  integer, intent(in) :: leading_dim
  SCALAR, intent(inout) :: &
    bsedbody(leading_dim,xct%nspin,xct%nspin), &
    bsedhead(leading_dim,xct%nspin,xct%nspin), &
    bsedwing(leading_dim,xct%nspin,xct%nspin), &
    bsex(leading_dim,xct%nspin,xct%nspin)
  integer, intent(in) :: ii,ik,ikp,ic_in,icp_in,iv_in,ivp_in
  SCALAR, intent(inout) :: &
    bsedbody1(leading_dim,xct%nspin,xct%nspin), &
    bsedwing1(leading_dim,xct%nspin,xct%nspin), &
    bsedbody2(leading_dim,xct%nspin,xct%nspin), &
    bsedwing2(leading_dim,xct%nspin,xct%nspin)
  real(DP), intent(in) :: vcoularray(xct%ng,qg%nf),fq(3),qq,q0len
  integer, intent(in) :: g0(3),ifq,irq

  integer :: ipe,iit
  character :: filename*20
  integer :: jj,sinv
  integer :: iktt,ic,icp,iv,ivp
  integer :: ig,igp,igpe,igadd,isc,iscp,isv,isvp,nfk,j
  integer :: neps,ngmax,nepsmin
  integer :: ikpt,incband,invband
  real(DP) :: ekin,fqq(3)
  real(DP) :: length,qlen,vq(3),tsec(2)
  
  integer :: ipeDumb
  integer, save :: ik_old = 0
  integer, save :: ikp_old = 0
  integer, save :: iv_old = 0
  integer, save :: ivp_old = 0
  integer, save :: ic_old = 0
  integer, save :: icp_old = 0
  logical :: ivsave, icsave, ixsave, ixpsave
  
  integer, allocatable :: isrtq(:), isrtqi(:), isrtqt(:), ind(:), identityidx(:), indinv(:)
  real(DP), allocatable :: vcoul(:)
  SCALAR :: wval,wval1,wval2,epshead
  SCALAR, allocatable :: &
    epscol(:),wptcol(:),ph(:),epscolt(:,:),epscolt2(:,:), &
    mcc(:,:,:,:),mvv(:,:,:,:),mvc(:,:,:,:),mvpcp(:,:,:,:), &
    tempw(:,:,:,:),tempb(:,:,:,:),tempw1(:,:,:,:),tempw2(:,:,:,:), &
    tempb1(:,:,:,:),tempb2(:,:,:,:),temph(:,:,:)
  SCALAR, save, allocatable :: &
    mvvold(:,:,:,:),mccold(:,:,:,:),mvcold(:,:,:,:),mvpcpold(:,:,:,:)

!------------------------------------
! Data for dynamic screening

  integer :: gdiff(3),irqt
  real(DP) :: fact,tol
  SCALAR, allocatable :: cd95(:,:),invwtildecol(:),dynfactcol(:)
  SCALAR :: wpeff2,wtilde2,I_epsggp,epsggp,rho_g_minus_gp
  
  complex(DPC), dimension(:,:,:), allocatable :: fftbox1,fftbox2
  real(DP) :: scale
  integer, allocatable :: g0mgidx(:)
  integer :: Nfft(3), g0mg(3)
  integer :: itotj
      
  PUSH_SUB(mtxel_kernel)

  if ( ik /= -1) then

    if (xct%ivpar .eq. 0) invband=xct%nvband
    if (xct%ivpar .eq. 1) invband=1

    if (xct%icpar .eq. 0) incband=xct%ncband
    if (xct%icpar .eq. 1) incband=1
    
    ivsave = .false.
    ixsave = .false.
    ixpsave = .false.
    icsave = .false.
    if (ik_old .eq. ik .and. ikp_old .eq. ikp) then
      if (iv_old .eq. iv_in .and. ic_old .eq. ic_in) then
        ixsave = .true.
!          if (peinf%inode .eq. 0) write(6,781) ii,iv_old,iv_in,ic_old,ic_in
      endif
      if (ivp_old .eq. ivp_in .and. icp_old .eq. icp_in) then
        ixpsave = .true.
!          if (peinf%inode .eq. 0) write(6,782) ii,ivp_old,ivp_in,icp_old,icp_in
      endif
      if (iv_old .eq. iv_in .and. ivp_old .eq. ivp_in) then
        ivsave = .true.
!          if (peinf%inode .eq. 0) write(6,783) ii,iv_old,iv_in,ivp_old,ivp_in
      else
        iv_old=iv_in
        ivp_old=ivp_in
      endif
      if (ic_old .eq. ic_in .and. icp_old .eq. icp_in) then
        icsave = .true.
!          if (peinf%inode .eq. 0) write(6,784) ii,ic_old,ic_in,icp_old,icp_in
      else
        ic_old=ic_in
        icp_old=icp_in
      endif
    else
      ic_old=ic_in
      icp_old=icp_in
      iv_old=iv_in
      ivp_old=ivp_in
      ik_old=ik
      ikp_old=ikp
    endif
781 format('Reusing Exchange Matrix Elements',5i6)
782 format('Reusing Exchange Matrix P Elements',5i6)
783 format('Reusing Valence Matrix Elements',5i6)
784 format('Reusing Conduction Matrix Elements',5i6)
    
    call timacc(61,1,tsec)

    sinv = 1
    if (ik.gt.ikp) sinv = -1

    SAFE_ALLOCATE(isrtq, (gvec%ng))
    SAFE_ALLOCATE(isrtqi, (gvec%ng))
    isrtq=0
    isrtqi=0

  endif
    
!-----------------------------
! Now, read the dielectric matrix at point q. The umklapp
! vector, g0, is given by: k - kp = q + g0
! JRD: All procs need to participate in the communication

  if (xct%bLowComm) then
    isrtq(:)=xct%isrtq(:,irq)
    isrtqi(:)=xct%isrtqi(:,irq)
  else
    SAFE_ALLOCATE(isrtqt, (gvec%ng))
    do irqt = 1, qg%nr
      if (peinf%inode .eq. 0) then
        isrtqt(:)=xct%isrtq(:,irqt)
      endif
#ifdef MPI
      call MPI_BCAST(isrtqt,gvec%ng,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
#endif
      if (irq .eq. irqt .and. ik /= -1) isrtq(:) = isrtqt(:)

      if (peinf%inode .eq. 0) then
        isrtqt(:)=xct%isrtqi(:,irqt)
      endif
#ifdef MPI
      call MPI_BCAST(isrtqt,gvec%ng,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
#endif
      if (irq .eq. irqt .and. ik /= -1) isrtqi(:) = isrtqt(:)
    enddo
    SAFE_DEALLOCATE(isrtqt)
  endif

  if ( ik /= -1) then

    neps=xct%nmtxa(irq)

    SAFE_ALLOCATE(wptcol, (xct%neps))
    wptcol=0.d0

    SAFE_ALLOCATE(epscol, (neps))
    SAFE_ALLOCATE(vcoul, (xct%ng))

    if (xct%dynamic_screening) then
      SAFE_ALLOCATE(dynfactcol, (xct%neps))
      dynfactcol = 0.0d0
    endif
    
    call timacc(61,2,tsec)
    
    call timacc(62,1,tsec)


!------------- Compute Vcoul -----------------------------------------------

! Compute Coulomb interaction at this q vector: vcoul(q+G)
! for q+G=0 we set the interaction to zero.

    vcoul = 0.0d0
    nfk = 0
    
    ikpt=0
    do iktt=1,qg%nf
      vq= qg%f(:,iktt)-fq(:)
      qlen = DOT_PRODUCT(vq,MATMUL(crys%bdot,vq))
      if (qlen .lt. TOL_Zero) ikpt=iktt
    enddo
    
    if (ikpt .eq. 0) then
      write(0,*) 'No match for', fq
      call die("couldn't find kpoint")
    endif
    
    vcoul(:)=vcoularray(:,ikpt)
    
    call timacc(62,2,tsec)

! End Construct Vcoul
!-------------------------------------------------------------------------

!----------------------------
! Map g-vectors of eps**(-1)(r(qq)) and calculate phases
! If G is beyond neps, set phase=0 (that is: neglect this G vector)

    call timacc(69,1,tsec)
    
    ekin= 1.0d9
    igp= 0
    SAFE_ALLOCATE(ind, (xct%neps))
    SAFE_ALLOCATE(indinv, (xct%neps))
    SAFE_ALLOCATE(ph, (xct%neps))
    ind(:)=0
    indinv(:)=0
    ph(:)=ZERO
    SAFE_ALLOCATE(identityidx, (gvec%ng))
    do ig=1,gvec%ng
      identityidx(ig)=ig
    enddo
    
    nepsmin = xct%neps
    if (neps .lt. xct%neps) then
      nepsmin = neps
    endif
    
    call gmap(gvec,syms,xct%neps,qg%itran(ifq), &
      qg%kg0(:,ifq),identityidx,isrtqi,ind,ph, &
      xct%die_outside_sphere)

! JRD: Make sure no illegal values in ind and construct indinv

    do ig=1,xct%neps
      if (ind(ig).gt.nepsmin) then
        ind(ig)= nepsmin
        ph(ig)=0.d0
!          write(0,*) peinf%inode,'WARNING!! Outside Sphere from gmap'
      endif
      if (ind(ig).eq.0) then
        ind(ig)= nepsmin
        ph(ig)=0.d0
      endif
      if (indinv(ind(ig)) .ne. 0 .and. ind(ig) .ne. nepsmin) then
        write(0,*) peinf%inode,'WARNING!! ind not 1->1!'
        write(0,*) ig,ind(ig),indinv(ind(ig)),xct%neps
      endif
      indinv(ind(ig))=ig
    enddo
    do ig=1,xct%neps
      if (indinv(ig).gt.nepsmin) then
        indinv(ig)= nepsmin
      endif
      if (indinv(ig).le.0) then
        indinv(ig)= nepsmin
      endif
    enddo

! JRD: Dumb Debugging

    SAFE_DEALLOCATE(identityidx)

    call timacc(69,2,tsec)
    call timacc(70,1,tsec)

!-------------------------------
! We need to now find the head of eps

    epshead=xct%epsdiag(ind(1),irq)

    call timacc(70,2,tsec)

!----------------------------
! If computing dynamic screening, we need the charge density in order
! calculate the Plasmon-Pole parameters.  Read from "CD95" file (used
! in GW calculation).
! Ref: http://www.nature.com/nature/journal/v471/n7337/full/nature09897.html

    if (xct%dynamic_screening) then

      call logit('         mtxel_kernel: dynamic screening setup')

! Allocate space for charge density and read from file

! dynamic screening is not working at all -- JRD
! therefore I will not bother making this work with the
! new format for RHO files here -- DAS

      call die("New format for RHO not implemented in mtxel_kernel.f90.")
      ! see Sigma/input.f90 for how to read in RHO

      SAFE_ALLOCATE(invwtildecol, (xct%neps))
      
    endif ! dynamic_screening



!-------------------------------------------------------------------------------------------
!--------- Compute direct term matrix elements ( <c'k'|e^{i(G0-G).r|ck> ) etc --------------



! If g0(:) is non-zero, the size of mcc,mvv must increase: umklapp vector

    call timacc(65,1,tsec)

    call logit('         mtxel_kernel: direct term mats')
    fqq(:) = dble(g0)
    length = DOT_PRODUCT(fqq,MATMUL(crys%bdot,fqq))
    if(length.ne.0) then
      length = length + xct%ecutg
      do ig=1,gvec%ng
        if (length.lt.gvec%ekin(ig)) exit
      enddo
      ngmax = ig - 1
    else
      ngmax = xct%ng
    endif
    
    if (xct%icpar .eq. 1) then
      SAFE_ALLOCATE(mcc, (ngmax,1,1,xct%nspin))
    else
      SAFE_ALLOCATE(mcc, (ngmax,xct%ncband,xct%ncband,xct%nspin))
    endif
    
    if (xct%ivpar .eq. 1) then
      SAFE_ALLOCATE(mvv, (ngmax,1,1,xct%nspin))
    else
      SAFE_ALLOCATE(mvv, (ngmax,xct%nvband,xct%nvband,xct%nspin))
    endif
    
    if (.not. ivsave .and. xct%ivpar .eq. 1) then
      if (ii .ne. 1) then
        SAFE_DEALLOCATE(mvvold)
      end if
      SAFE_ALLOCATE(mvvold, (ngmax,1,1,xct%nspin))
    endif
    if (.not. icsave .and. xct%icpar .eq. 1) then
      if (ii .ne. 1) then
        SAFE_DEALLOCATE(mccold)
      end if
      SAFE_ALLOCATE(mccold, (ngmax,1,1,xct%nspin))
    endif
    mcc=0.0d0
    mvv=0.0d0

! Compute size of FFT box we need

    call setup_FFT_sizes(gvec%kmax,Nfft,scale)

! Allocate FFT boxes

    SAFE_ALLOCATE(fftbox1, (Nfft(1),Nfft(2),Nfft(3)))
    SAFE_ALLOCATE(fftbox2, (Nfft(1),Nfft(2),Nfft(3)))

! We calculate <c'k'|e^{i(G0-G).r|ck>, so we need an
! index table of G0-G in the list of g vectors
! note: g0mgidx stores 0 beyond screened_coulomb_cutoff;
! these values should not be used in the FFT work

    SAFE_ALLOCATE(g0mgidx, (gvec%ng))
    g0mgidx(:) = 0
    do ig=1,ngmax
      g0mg(:) = sinv*( g0(:)-gvec%k(:,ig) )
      call findvector(g0mgidx(ig),g0mg(1),g0mg(2),g0mg(3),gvec)
      if (g0mgidx(ig) == 0) call die('cannot find g0-g')
    enddo
    
! Compute matrix elements: <c'k'|exp(i(kp-k-G+G0).r)|ck>

    if (.not. icsave .or. xct%icpar .eq. 0) then
      do isc=1,xct%nspin
        do icp=1,incband
          call put_into_fftbox(wfncp%ng,wfncp%cg(1:,icp,isc),gvec%ng,gvec%k,wfncp%isort,fftbox1,Nfft)
          call do_FFT(fftbox1,Nfft,1)
          call conjg_fftbox(fftbox1,Nfft)
          do ic=1,incband
            call put_into_fftbox(wfnc%ng,wfnc%cg(1:,ic,isc),gvec%ng,gvec%k,wfnc%isort,fftbox2,Nfft)
            call do_FFT(fftbox2,Nfft,1)
            call multiply_fftboxes(fftbox1,fftbox2,Nfft)
            call do_FFT(fftbox2,Nfft,1)
            call get_from_fftbox(ngmax,mcc(1:,ic,icp,isc),gvec%ng,gvec%k,g0mgidx,fftbox2,Nfft,scale)
          enddo
        enddo
      enddo!isc
      if (xct%icpar .eq. 1) mccold(:,:,:,:)=mcc(:,:,:,:)
    else
      mcc(:,:,:,:)=mccold(:,:,:,:)
    endif

! Compute matrix element: <v'k'|exp(i(kp-k-G+G0).r)|vk>

    if (.not. ivsave .or. xct%ivpar .eq. 0) then
      do isv=1,xct%nspin
        do ivp=1,invband
          call put_into_fftbox(wfnvp%ng,wfnvp%cg(1:,ivp,isv),gvec%ng,gvec%k,wfnvp%isort,fftbox1,Nfft)
          call do_FFT(fftbox1,Nfft,1)
          call conjg_fftbox(fftbox1,Nfft)
          do iv=1,invband
            call put_into_fftbox(wfnv%ng,wfnv%cg(1:,iv,isv),gvec%ng,gvec%k,wfnv%isort,fftbox2,Nfft)
            call do_FFT(fftbox2,Nfft,1)
            call multiply_fftboxes(fftbox1,fftbox2,Nfft)
            call do_FFT(fftbox2,Nfft,1)
            call get_from_fftbox(ngmax,mvv(1:,iv,ivp,isv),gvec%ng,gvec%k,g0mgidx,fftbox2,Nfft,scale)
          enddo
        enddo
      enddo !isv
      if ( xct%ivpar .eq. 1) mvvold(:,:,:,:)=mvv(:,:,:,:)
    else
      mvv(:,:,:,:)=mvvold(:,:,:,:)
    endif

! Done calculating mcc and mvv

    SAFE_DEALLOCATE(g0mgidx)

    call timacc(65,2,tsec)



! End computation of independent matrix elements.
!-------------------------------------------------------------------------------------------
!-------------------------------------------------------------------------------------------



!----------- Compute head, wings, body (n.b.: omitted multiplicative factor) ---------------

! This part has been rewritten to accelerate it by a factor xct%neps:
! schematically, the direct term is given by
!
! bsed(ic,iv,icp,ivp) = sum(ig,igp) { Mccp(ig) * W(ig,igp) * Mvvp(igp)
!
! so we loop over (iv,ivp,isv), and for each such set take Mvvp(:)
! and multiply by W(ig,:) to get temp(ig).  We then loop over
! (ic,icp,isc) and sum over ig and accumulate Mccp(ig)*temp(ig).
! So the inner loop has one loop over ig instead of a pair as before.
! The only cost is the storage of vector temp(:), which is tiny.
!
! The reason things look messy is that (1) we have to worry about
! head/wing/body parts separately, and (2) we may have dynamic
! corrections.
!
! The code below assumes that xct%neps<=xct%ng, which is true for
! any sane case I can think of.

    call timacc(61,1,tsec)

    call logit('         mtxel_kernel: head-wings-body')

!#ifdef VERBOSE
!      if (peinf%inode .eq. 0) then
!        write(6,*) peinf%inode,'*** VERBOSE:          Allocating tempb'
!        write(6,*) peinf%inode,'*** VERBOSE:          ',xct%ng* &
!        xct%nvband*xct%nspin*xct%nvband
!      endif
!#endif

    if (xct%ivpar .eq. 1) then
      SAFE_ALLOCATE(temph, (1,1,xct%nspin))
      SAFE_ALLOCATE(tempw, (xct%ng,1,1,xct%nspin))
      SAFE_ALLOCATE(tempb, (xct%ng,1,1,xct%nspin))
      if (xct%dynamic_screening) then
        SAFE_ALLOCATE(tempw1, (xct%ng,1,1,xct%nspin))
        SAFE_ALLOCATE(tempw2, (xct%ng,1,1,xct%nspin))
        SAFE_ALLOCATE(tempb1, (xct%ng,1,1,xct%nspin))
        SAFE_ALLOCATE(tempb2, (xct%ng,1,1,xct%nspin))
      endif
    else
      SAFE_ALLOCATE(temph, (xct%nvband,xct%nvband,xct%nspin))
      SAFE_ALLOCATE(tempw, (xct%ng,xct%nvband,xct%nvband,xct%nspin))
      SAFE_ALLOCATE(tempb, (xct%ng,xct%nvband,xct%nvband,xct%nspin))
      if (xct%dynamic_screening) then
        SAFE_ALLOCATE(tempw1, (xct%ng,xct%nvband,xct%nvband,xct%nspin))
        SAFE_ALLOCATE(tempw2, (xct%ng,xct%nvband,xct%nvband,xct%nspin))
        SAFE_ALLOCATE(tempb1, (xct%ng,xct%nvband,xct%nvband,xct%nspin))
        SAFE_ALLOCATE(tempb2, (xct%ng,xct%nvband,xct%nvband,xct%nspin))
      endif
    endif

! Loop over ig,igp where we have screening and calculate temporaries.

    temph(:,:,:) = 0.0d0
    tempw(:,:,:,:) = 0.0d0
    tempb(:,:,:,:) = 0.0d0
    if (xct%dynamic_screening) then
      tempw1(:,:,:,:) = 0.0d0
      tempw2(:,:,:,:) = 0.0d0
      tempb1(:,:,:,:) = 0.0d0
      tempb2(:,:,:,:) = 0.0d0
    endif


!--------------------------------------------------------------------------------
!---------------------- igp loop ------------------------------------------------


    fact = 16.0d0 * PI_D / crys%celvol
    tol = TOL_zero

    call logit('         mtxel_kernel: computing W(g,gp)')
    
    call timacc(61,2,tsec)
    
! JRD Open EPS File

    call timacc(72,1,tsec)
    
    if (xct%iwriteint .eq. 0) then
      if (irq.le.10000) then
        write(filename,'(a,i4.4)') 'INT_EPS_',irq-1
      endif
      call open_file(17,file=filename,form='unformatted',status='old')
    endif
    
    call timacc(72,2,tsec)
    
  endif ! ik /= -1
  
  call timacc(77,1,tsec)
  
  if (xct%iwriteint .ne. 0) then
    SAFE_ALLOCATE(epscolt, (xct%nmtxmax,xct%maxpet))
    SAFE_ALLOCATE(epscolt2, (xct%nmtxmax,xct%maxpet))
    epscolt=0
    epscolt2=0
  endif
  
  call timacc(77,2,tsec)
  
  itotj=0
  
  do ipe = 1, peinf%npes

    !if (peinf%inode .eq. 0) write(6,*) "ipe loop", ipe

    ipeDumb = ipe
    if (ipe .eq. 1) ipeDumb = peinf%npes+1

!--------------------------
! JRD: Actually read in eps

    if (xct%iwriteint .ne. 0) then

      call timacc(74,1,tsec)

      if ( xct%bLowComm ) then

        epscolt = 0D0
        jj=1
        irqt = irq

        if (irqt .ne. 0) then
          do while ( (peinf%npes*(jj-1)+ipeDumb-1) .le. xct%nmtxmax )
            epscolt(:,jj)=xct%epscol(:,peinf%npes*(jj-1)+ipeDumb-1,irqt)      
            jj = jj + 1
          enddo
        endif

      else

        do irqt = 1, qg%nr

          if ((ipe -1) .eq. peinf%inode) then
            epscolt2(:,:)=xct%epscol(:,:,irqt)
          endif
 
#ifdef MPI
          call MPI_BCAST(epscolt2,xct%nmtxmax*xct%maxpet,MPI_SCALAR,ipe-1,MPI_COMM_WORLD,mpierr)
#endif
          if (irq .eq. irqt) then
            epscolt=epscolt2
          endif
        enddo

      endif
      call timacc(74,2,tsec)
    
    endif
  
    ! JRD: If we have no work to do. Just cycle here.
    if (ik .eq. -1) cycle

    igpe_loop: do igpe = 1, xct%maxpe(ipe)
      
      call timacc(71,1,tsec)
      
      itotj = itotj + 1
      
      if (xct%iwriteint .eq. 1) then
        igp = xct%epsowni(igpe,ipe)
      else
        igp = itotj
      endif
      
      epscol=0d0
      wptcol=0d0
        
      if (xct%dynamic_screening) then
        invwtildecol=0d0
        dynfactcol=0d0
      endif
      
      if (xct%iwriteint .eq. 0) then
        if (igp .le. nepsmin) then
          read(17) epscol(1:neps)
        else
          cycle igpe_loop
        endif
      else
        if (igp .le. nepsmin) then
          epscol(1:neps)=epscolt(1:neps,igpe)
        else
          cycle igpe_loop
        endif
      endif
      
      call timacc(71,2,tsec)


!-----------------------------
! Calculate Plasmon-Pole data

      if (xct%dynamic_screening) then

! Tolerance for "zero" values

        do ig=1,xct%neps


! Get rho(g-gp)

          gdiff(:)=gvec%k(:,ig)-gvec%k(:,indinv(igp))
          call findvector(igadd,gdiff(1),gdiff(2),gdiff(3),gvec)
          if (igadd==0) cycle
          rho_g_minus_gp = sum(cd95(igadd,:))

! Calc effective plasma frequency (quantity in formula (31)
! on page 5396 of Hybertsen & Louie, PRB vol 34, (1986)
! given by (where wp^2=16*pi*rho(0))
! wpeff2 = wp^2*[rho(G-G')/rho(0)]*(q+G).(q+G')*vc(q+G)/(8pi)
! if q+g<>0, then just the formula

          if (ig.ne.1 .or. qg%indr(ifq).ne.1) then
            wpeff2 = fact * rho_g_minus_gp * &
              dot_product(gvec%k(:,indinv(igp))+fq(:), &
              matmul(crys%bdot,gvec%k(:,ig)+fq(:))) * vcoul(ig)
          else

! If gp=0 and q+g=0, then wp^2 or 0 (depending)

            if (indinv(igp) .eq. 1) then
              wpeff2 = fact*rho_g_minus_gp
              if (xct%icutv.ne.0) wpeff2 = 0.0d0

! if gp<>0, then just zero

            else
              wpeff2 = 0.0d0
            endif
          endif
          
! Get static inverse dielectric entry into epsggp
! and I-eps(g,gp)

          epsggp = ph(ig)*MYCONJG(ph(indinv(igp)))*epscol(ind(ig))
          if (ig .eq. indinv(igp)) then
            I_epsggp = 1.0d0 - epsggp
          else
            I_epsggp = -epsggp
          endif

! Skip if too small...

          if (abs(I_epsggp)<tol) cycle
          if (abs(wpeff2)<tol) cycle

! Compute mode frequency squared wtilde2 (Eq. (30) on
! page 5396 of Hybertsen and Louie).  GPP model has poles
! at wtilde and -wtilde.

          wtilde2 = wpeff2 / I_epsggp
          invwtildecol(ig) = 1.0d0/sqrt(wtilde2)

        enddo !ig
      endif ! Dynamic screening

!-----------------------------------------
! Compute W(g,gp) for q + g0 =kp-k

! JRD: What we actually interpolate is only the head matrix elements (i.e. excluding
! the 1/q^2 factor), the wing part that does not diverge (i.e. multiplied by |q|), and the
! complete body element.  If we are truncating, instead of multiplying the wings by |q|
! we divide them noting that in this case the wing is DEFINED as being
! some smooth function multiplied by |q|*Vtrunc(G=0,q)*eps^-1(G=0,G`=0,q)
! This is because the wings of the dielectric matrix are /propto |q| and pick up a factor
! of eps^-1(G=0,G`=0,q) during the inversion process (see Epsilon/epsinv.f90).
! We include this factor here and not above because eps^-1(G=0,G`=0,q) varies quicker in truncated case.
! The Vtrunc(G=0,q) factor comes from the bare (truncated) coulomb interaction.

! The dynfact matrix holds the difference W-vc which is used in the
! plasmon-pole dynamical fit [also equals (eps-I)*vc]

      call timacc(63,1,tsec)

! JRD: Dumb Debugging

!        write(6,*) peinf%inode,'Calcing wpt'

!-------------------------------------
! Head for q=0 and q<>0, Interpolate different things depending on
! what screening/truncation we use.  See wiki.

      if (indinv(igp) .eq. 1) then

! Semiconductor Screening

        if (xct%iscreen .eq. 0) then
          wptcol(1)= 1.0d0

! Graphene Screening

        elseif (xct%iscreen .eq. 1) then
          if (xct%icutv .eq. 0) then
            if ( qq .gt. Tol_Zero) then
              wptcol(1)=vcoul(1)*epscol(ind(1))*qq
            else
              wptcol(1)=vcoul(1)*epscol(ind(1))*q0len
           endif
          else
            wptcol(1)=1.0d0
          endif
          
! Metal Screening

        else
          wptcol(1)=vcoul(1)*epscol(ind(1))
        endif

!-----------------------------------
! First wing (i.e. (g<>0,0)).  See comments from first wing below.

        do ig=2,xct%neps

! Semiconductor Screening

          if (xct%iscreen .eq. 0) then

! --No Truncation

            wptcol(ig)=ph(ig)*MYCONJG(ph(1)) &
              *epscol(ind(ig))*vcoul(1)*qq

! --With Truncation

            if (xct%icutv .ne. 0) then
              wptcol(ig)=ph(ig)*MYCONJG(ph(1)) &
                *epscol(ind(ig))*vcoul(1)
            end if

! --SemiCond - Zero q=0 wings

            if ( qq .lt. Tol_Zero) then
              wptcol(ig)=0D0
            endif

! Metal/Graphene Screening

          else
            wptcol(ig)=ph(ig)*MYCONJG(ph(1))*epscol(ind(ig)) &
              *vcoul(1)
            
! --Graphene Wire - Zero q=0 wings

            if (xct%iscreen .eq. 1 .and. xct%icutv .eq. 4 &
              .and. qq .lt. TOL_Zero) then
              wptcol(ig)=0d0
            end if
              
          end if
            
          if (xct%dynamic_screening) dynfactcol(ig) = wptcol(ig)
            
        enddo
          
      else

!-------------------------------------
!  Second wing (i.e. (0,gp<>0)).  We zero semicond wings for q->0.  See
!  the readme.

! Semiconductor Screening

        if (xct%iscreen .eq. 0) then

! --No Truncation
            
          wptcol(1)=ph(1)*MYCONJG(ph(indinv(igp)))* &
            epscol(ind(1))*vcoul(indinv(igp))*qq

! --With Truncation

          if (xct%icutv .ne. 0) then
            wptcol(1)=ph(1)*MYCONJG(ph(indinv(igp)))* &
              epscol(ind(1))*vcoul(indinv(igp))
          endif

! --SemiCond - Zero q=0 wings

          if ( qq .lt. Tol_Zero) then
            wptcol(1)=0D0
          endif

! Metal/Graphene Screening

        else
          wptcol(1)=ph(1)*MYCONJG(ph(indinv(igp)))* &
            epscol(ind(1))*vcoul(indinv(igp))

! --Graphene Wire - Zero q=0 wings

          if (xct%iscreen.eq.1 .and. xct%icutv.eq.4 &
            .and. qq .lt. TOL_Zero) then
            wptcol(1)=0d0
          end if

        endif

        if (xct%dynamic_screening) dynfactcol(1) = wptcol(1)

!----------------------------------------
!  Body (i.e. (g<>0,gp<>0))
!  Does not depend on truncation or screening.

        do ig=2,xct%neps
          wptcol(ig)=ph(ig)*MYCONJG(ph(indinv(igp))) * &
            epscol(ind(ig))*vcoul(indinv(igp))

! JRD: Dumb Debugging

!            if (abs(vcoul(indinv(igp))) .gt. 10D8) then
!              write(0,*) 'WARNING: Large Vcoul', ig, igp,
!     >         indinv(igp),vcoul(indinv(igp)),xct%neps,neps
!            endif
!            if (abs(epscol(ind(ig))) .gt. 10D8) then
!              write(0,*) 'WARNING: Large eps', ig,
!     >         ind(ig),epscol(ind(ig)),xct%neps,neps
!            endif

          if (xct%dynamic_screening) then
            dynfactcol(ig) = wptcol(ig)
            if (ig==igp) dynfactcol(ig) = wptcol(ig) - &
              vcoul(indinv(igp))
          endif
        enddo
        
      endif

! JRD: Dumb Debugging

!        write(6,*) peinf%inode,'Calced wpt'

!------------------------------------------
! Take complex conjugate of epsinv if ik.gt.ikp

      if (sinv.eq.-1) wptcol = MYCONJG(wptcol)
      
      call timacc(63,2,tsec)

!---------------------------------
! ig loop

      call timacc(64,1,tsec)
      
      do iv=1,invband
        do ivp=1,invband
          do isv=1,xct%nspin
            
            do ig=1,xct%neps
              
              wval = wptcol(ig)
              
              if (xct%dynamic_screening) then
                wval1 = dynfactcol(ig)*invwtildecol(ig)
                wval2 = wval1*invwtildecol(ig)
              endif
              
! Head

              if (ig==1 .and. indinv(igp)==1) then
                temph(iv,ivp,isv) = wval*mvv(1,iv,ivp,isv)

! Wing1

              elseif (ig==1) then
                tempw(1,iv,ivp,isv) = tempw(1,iv,ivp,isv) + &
                  wval*mvv(indinv(igp),iv,ivp,isv)
                if (xct%dynamic_screening) then
                  tempw1(1,iv,ivp,isv) = tempw1(1,iv,ivp,isv) + &
                    wval1*mvv(indinv(igp),iv,ivp,isv)
                  tempw2(1,iv,ivp,isv) = tempw2(1,iv,ivp,isv) + &
                    wval2*mvv(indinv(igp),iv,ivp,isv)
                endif

! Wing2

              elseif (indinv(igp)==1) then
                tempw(ig,iv,ivp,isv) = wval*mvv(1,iv,ivp,isv)
                if (xct%dynamic_screening) then
                  tempw1(ig,iv,ivp,isv) = wval1*mvv(1,iv,ivp,isv)
                  tempw2(ig,iv,ivp,isv) = wval2*mvv(1,iv,ivp,isv)
                endif

! Body

              else
                tempb(ig,iv,ivp,isv) = tempb(ig,iv,ivp,isv) + &
                  wval*mvv(indinv(igp),iv,ivp,isv)
                if (xct%dynamic_screening) then
                  tempb1(ig,iv,ivp,isv) = tempb1(ig,iv,ivp,isv) + &
                    wval1*mvv(indinv(igp),iv,ivp,isv)
                  tempb2(ig,iv,ivp,isv) = tempb2(ig,iv,ivp,isv) + &
                    wval2*mvv(indinv(igp),iv,ivp,isv)
                endif
              endif
            enddo
          enddo
        enddo
      enddo
      
      call timacc(64,2,tsec)
      
    enddo igpe_loop !igpe
  enddo !ipe

  if (ik .eq. -1) then
    SAFE_DEALLOCATE(epscolt)
    SAFE_DEALLOCATE(epscolt2)
    POP_SUB(mtxel_kernel)
    return
  endif

  if (xct%iwriteint .eq. 0) call close_file(17)
  
  call timacc(66,1,tsec)


! Add in contribution from unscreened coulomb at high G vectors


  do iv=1,invband
    do ivp=1,invband
      do isv=1,xct%nspin
        
        do ig=1,xct%neps
          if (abs(ph(ig)) .lt. TOL_Zero) then
            wval = vcoul(ig)
            tempb(ig,iv,ivp,isv) = tempb(ig,iv,ivp,isv) + &
              wval*mvv(ig,iv,ivp,isv)
          endif
        enddo
        
        do ig=xct%neps+1,xct%ng
          wval = vcoul(ig)
          tempb(ig,iv,ivp,isv) = tempb(ig,iv,ivp,isv) + &
            wval*mvv(ig,iv,ivp,isv)
        enddo

! Now loop over the (ic,icp,isc) to calculate the direct
! elements using temporaries.

        do ic = 1, incband
          do icp = 1, incband
            do isc=1,xct%nspin
              
              if (xct%icpar .eq. 0) then
                iit = peinf%wown(1,1,ikp,1,1,ik) + xct%nvband*xct%nvband*xct%ncband*(icp-1) &
                  + xct%nvband*xct%nvband*(ic-1) + xct%nvband*(ivp-1) + iv - 1
              else if (xct%ivpar .eq. 0) then 
                iit = peinf%wown(1,icp_in,ikp,1,ic_in,ik) + xct%nvband*(ivp-1) + iv -1
              else

! JRD: is this correct?

!                    iit = ii
                iit = peinf%wown(ivp_in,icp_in,ikp,iv_in,ic_in,ik)
              endif
            
 
              if (iit .gt. leading_dim) then
                write(0,*) peinf%inode, ik, ikp
                write(0,*) peinf%wown(1,1,ikp,1,1,ik), ii, iit
                write(0,*) peinf%nckpe, peinf%myown, leading_dim
                call die("Internal error in mtxel_kernel with array dimensions.")
              endif
! Head

              ig = 1
              bsedhead(iit,isc,isv) = bsedhead(iit,isc,isv) + &
                MYCONJG(mcc(ig,ic,icp,isc)) * temph(iv,ivp,isv)

! Wing and Body

              do ig=1,xct%ng
                bsedwing(iit,isc,isv) = bsedwing(iit,isc,isv) + &
                  MYCONJG(mcc(ig,ic,icp,isc)) * tempw(ig,iv,ivp,isv)
                bsedbody(iit,isc,isv) = bsedbody(iit,isc,isv) + &
                  MYCONJG(mcc(ig,ic,icp,isc)) * tempb(ig,iv,ivp,isv)
                
                if (xct%dynamic_screening) then
                  bsedwing1(iit,isc,isv) = bsedwing1(iit,isc,isv) + &
                    MYCONJG(mcc(ig,ic,icp,isc)) * tempw1(ig,iv,ivp,isv)
                  bsedwing2(iit,isc,isv) = bsedwing2(iit,isc,isv) + &
                    MYCONJG(mcc(ig,ic,icp,isc)) * tempw2(ig,iv,ivp,isv)
                  bsedbody1(iit,isc,isv) = bsedbody1(iit,isc,isv) + &
                    MYCONJG(mcc(ig,ic,icp,isc)) * tempb1(ig,iv,icp,isv)
                  bsedbody2(iit,isc,isv) = bsedbody2(iit,isc,isv) + &
                    MYCONJG(mcc(ig,ic,icp,isc)) * tempb2(ig,iv,ivp,isv)
                endif
              enddo !ig
            enddo !isc
          enddo
        enddo
      enddo !isv
    enddo
  enddo
  
  call timacc(66,2,tsec)
  
  SAFE_DEALLOCATE(temph)
  SAFE_DEALLOCATE(tempw)
  SAFE_DEALLOCATE(tempb)
  
  if (xct%dynamic_screening) then
    SAFE_DEALLOCATE(tempw1)
    SAFE_DEALLOCATE(tempw2)
    SAFE_DEALLOCATE(tempb1)
    SAFE_DEALLOCATE(tempb2)
  endif
  
  SAFE_DEALLOCATE(vcoul)
  SAFE_DEALLOCATE(mcc)
  SAFE_DEALLOCATE(mvv)
  SAFE_DEALLOCATE(epscol)
  if (xct%iwriteint .eq. 1) then
    SAFE_DEALLOCATE(epscolt)
    SAFE_DEALLOCATE(epscolt2)
  end if
  SAFE_DEALLOCATE(wptcol)
  SAFE_DEALLOCATE(ind)
  SAFE_DEALLOCATE(indinv)
  SAFE_DEALLOCATE(ph)
  SAFE_DEALLOCATE(isrtq)
  SAFE_DEALLOCATE(isrtqi)
  if (xct%dynamic_screening) then
    SAFE_DEALLOCATE(invwtildecol)
    SAFE_DEALLOCATE(dynfactcol)
    SAFE_DEALLOCATE(cd95)
  endif
  
  call timacc(67,1,tsec)

!------------------------------------
! Compute exchange term matrices

  call logit('         mtxel_kernel: X term matrices')
    
  SAFE_ALLOCATE(mvc, (xct%ng,invband,incband,xct%nspin))
  SAFE_ALLOCATE(mvpcp, (xct%ng,invband,incband,xct%nspin))
  
  if (.not. ixsave .and. xct%ivpar .eq. 1 .and. xct%icpar .eq. 1) then
    if (ii .ne. 1) then
      SAFE_DEALLOCATE(mvcold)
    end if
    SAFE_ALLOCATE(mvcold, (xct%ng,1,1,xct%nspin))
  endif
  if (.not. ixpsave .and. xct%ivpar .eq. 1 .and. xct%icpar .eq. 1) then
    if (ii .ne. 1) then
      SAFE_DEALLOCATE(mvpcpold)
    end if
    SAFE_ALLOCATE(mvpcpold, (xct%ng,1,1,xct%nspin))
  endif
  
  mvc = 0.0d0
  mvpcp = 0.0d0

! Use FFTs to calculate the matrix elements

! We need an index table that is just the identity

  SAFE_ALLOCATE(identityidx, (gvec%ng))
  do j=1,gvec%ng
    identityidx(j)=j
  enddo
  
! Compute matrix elements: <vk|e^(i*G.r)|ck>


! JRD: Dumb Debugging

!      write(6,*) peinf%inode, 'Doing Exchange fft'

  if (.not. ixsave .or. xct%icpar .eq. 0 .or. xct%ivpar .eq. 0) then
    do isc=1,xct%nspin
      do iv=1,invband
        isv=isc
        call put_into_fftbox(wfnv%ng,wfnv%cg(1:,iv,isv),gvec%ng,gvec%k,wfnv%isort,fftbox1,Nfft)
        call do_FFT(fftbox1,Nfft,1)
        call conjg_fftbox(fftbox1,Nfft)
        do ic = 1, incband
          call put_into_fftbox(wfnc%ng,wfnc%cg(1:,ic,isc),gvec%ng,gvec%k,wfnc%isort,fftbox2,Nfft)
          call do_FFT(fftbox2,Nfft,1)
          call multiply_fftboxes(fftbox1,fftbox2,Nfft)
          call do_FFT(fftbox2,Nfft,1)
          call get_from_fftbox(xct%ng,mvc(1:,iv,ic,isc),gvec%ng,gvec%k,identityidx,fftbox2,Nfft,scale)

! We actually do not want the G=0 component in the
! exchange, so set its matrix element to zero.

          mvc(1,iv,ic,isc) = 0.0
        enddo
      enddo
    enddo!isc
    if (xct%icpar .eq. 1 .and. xct%ivpar .eq. 1) mvcold(:,:,:,:)=mvc(:,:,:,:)
  else
    mvc(:,:,:,:)=mvcold(:,:,:,:)
  endif

! Compute matrix elements: <vpkp|e^(i*G.r)|cpkp>

! JRD: Dumb Debugging

!      write(6,*) peinf%inode, 'Doing Exchange fft2'

  if (.not. ixpsave .or. xct%icpar .eq. 0 .or. xct%ivpar .eq. 0) then
    do iscp=1,xct%nspin
      do iv=1,invband
        isvp=iscp
        call put_into_fftbox(wfnvp%ng,wfnvp%cg(1:,iv,isvp),gvec%ng,gvec%k,wfnvp%isort,fftbox1,Nfft)
        call do_FFT(fftbox1,Nfft,1)
        call conjg_fftbox(fftbox1,Nfft)
        do ic=1,incband
          call put_into_fftbox(wfncp%ng,wfncp%cg(1:,ic,iscp),gvec%ng,gvec%k,wfncp%isort,fftbox2,Nfft)
          call do_FFT(fftbox2,Nfft,1)
          call multiply_fftboxes(fftbox1,fftbox2,Nfft)
          call do_FFT(fftbox2,Nfft,1)
          call get_from_fftbox(xct%ng,mvpcp(1:,iv,ic,iscp),gvec%ng,gvec%k,identityidx,fftbox2,Nfft,scale)

! We actually do not want the G=0 component in the
! exchange, so set its matrix element to zero.

          mvpcp(1,iv,ic,iscp) = 0.0
        enddo
      enddo
    enddo!iscp
    if (xct%icpar .eq. 1 .and. xct%ivpar .eq. 1) mvpcpold(:,:,:,:)=mvpcp(:,:,:,:)
  else
    mvpcp(:,:,:,:)=mvpcpold(:,:,:,:)
  endif
  
! JRD: Dumb Debugging

!      write(6,*) peinf%inode, 'Xchange Done ffts'

  SAFE_DEALLOCATE(identityidx)
  SAFE_DEALLOCATE(fftbox1)
  SAFE_DEALLOCATE(fftbox2)

! JRD: Dumb Debugging

!      write(6,*) peinf%inode, 'Deallocated fftboxes'

! Done Calculating Matrix Elements Used in Exchange
!---------------------------------

  call timacc(67,2,tsec)
  call timacc(68,1,tsec)

!----------------------------------
! Compute bsex

  call logit('         mtxel_kernel: computing bsex')

! Calc. coulomb interaction for q=0

  SAFE_ALLOCATE(vcoul, (xct%ng))
  vcoul = 0.0d0

  ikpt=0
  do iktt=1,qg%nf
    vq(:) = qg%f(:,iktt)
    qlen = DOT_PRODUCT(vq,MATMUL(crys%bdot,vq))
    if (qlen .lt. TOL_Zero) ikpt=iktt
  enddo
  
  if (ikpt .eq. 0) then
    write(0,*) 'No match for q=0'
    call die("couldn't find kpoint")
  endif
  
  vcoul(:)=vcoularray(:,ikpt)
  
  do ic=1,incband
    do icp=1,incband
      do iv=1,invband
        do ivp=1,invband
          
          if (xct%icpar .eq. 0) then
            iit = peinf%wown(1,1,ikp,1,1,ik) + xct%nvband*xct%nvband*xct%ncband*(icp-1) &
              + xct%nvband*xct%nvband*(ic-1) + xct%nvband*(ivp-1) + iv -1
          else if (xct%ivpar .eq. 0) then
            iit = peinf%wown(1,icp_in,ikp,1,ic_in,ik) + xct%nvband*(ivp-1) + iv -1
          else
            iit = ii
          endif
          
          do isc=1,xct%nspin
            do iscp=1,xct%nspin
              do ig=2,xct%ng
                bsex(iit,isc,iscp)= &
                  bsex(iit,isc,iscp)+vcoul(ig)* &
                  mvpcp(ig,ivp,icp,iscp)*MYCONJG(mvc(ig,iv,ic,isc))
              enddo      !ig
            enddo        !iscp
          enddo          !isc
        enddo
      enddo
    enddo
  enddo
  
  call logit('         mtxel_kernel: done bsex')
  
  call timacc(68,2,tsec)
  
  SAFE_DEALLOCATE(vcoul)
  SAFE_DEALLOCATE(mvc)
  SAFE_DEALLOCATE(mvpcp)
  
  POP_SUB(mtxel_kernel)
  
  return
end subroutine mtxel_kernel
