!================================================================================
!
! Module:
!
! (1) mtxel_sxch()          Originally By ?         Last Modified 10/5/2009 (gsm)
!
!     Computes the sum (for information on current processor) of the
!     screened exchange and Coulomb hole (excluding bare exchange) for
!     the self-energy operator between bands n (sig%diag(in)) and m
!     (sig%diag(im)) at the frequency of the band l (sig%diag(il)).
!
!     Screened exchange is put into asxt(iw), Coulomb hole into acht(iw),
!     iw goes from 1 to 3 (see below for what it is).
!
!     On entry, asxt(:) and acht(:) are zeroed.
!
!     Physically, this routine will compute:
!
!     < psi_n ( k ) | Sigma_SX ( E ) + Sigma_CH ( E ) | psi_m ( k ) >
!
!     where E = E_l ( k ) + ( iw - 2 ) * sig%dw and iw = 1, 2, 3
!     so we evaluate the matrix elements of Sigma at three different
!     energies, E_l ( k ) - sig%dw, E_l ( k ), and E_l ( k ) + sig%dw.
!
!     Sigma(E) is computed within different approximations:
!     - static COHSEX approximation, both exact and partial sum CH (gsm)
!     - Generalized Plasmon Pole model (?)
!     - full frequency dependent inverse dielectric matrix (CDS/CHP/gsm)
!
!     If sig%ggpsum=1 we compute a half of the sum over G and G` vectors
!     (igp=1,ncouls and ig=1,igp) and take the complex conjugate for the
!     other half. This is only valid if we use the same truncation scheme
!     in both Epsilon and Sigma. If sig%ggpsum=2 we compute the full sum.
!
!     ch2 variables are used to calculate the CH part of the imaginary part
!     of the self energy assuming a zero energy broadening in the energy
!     denominator for the evaluation of the self energy (CHP).
!
!================================================================================

#include "f_defs.h"

module mtxel_sxch_m

  use global_m
  use misc_m

  implicit none

  private

  public :: mtxel_sxch

contains

subroutine mtxel_sxch(in,il,ispin,ncouls,neps,gvec,eps,ph,ind, &
  indinv,isrtrqi,isrtrq,vcoul,crys,sig,wpg,wfnk,wfnkq,ncoulch, &
  aqsn,aqsm,aqsch,asigt_imag,acht_n1,asxt,acht,achtcor,nspin,qk, &
  coulfact,inv_igp_index,ngpown,epsR,epsA, &
  achtD_n1,asxtDyn,achtDyn,achtDyn_cor,achtDyn_corb,ach2tDyn,icalc)

  integer, intent(in) :: in,il,ispin,ncouls,neps
  type (gspace), intent(in) :: gvec
  !> (neps,ngpown) Uninitialized unless we are running the complex version
  SCALAR, pointer, intent(in) :: eps(:,:)
  SCALAR, intent(in) :: ph(:) !< (gvec%ng)
  integer, intent(in) :: ind(:), indinv(:), isrtrq(:) !< (gvec%ng)
  !> (gvec%ng) Uninitialized unless freq_dep=0 .or. exact_ch=1
  integer, pointer, intent(in) :: isrtrqi(:)
  real(DP), intent(in) :: vcoul(:) !< (ncoul)
  type (crystal), intent(in) :: crys
  type (siginfo), intent(in) :: sig
  type (wpgen), intent(in) :: wpg
  type (wfnkstates), intent(in) :: wfnk
  type (wfnkqstates), intent(in) :: wfnkq
  integer, intent(in) :: ncoulch
  SCALAR, intent(in) :: aqsn(:,:), aqsm(:,:) !< (peinf%ntband_max,ncoul)
  !> (ncoulch) Uninitialized unless freq_dep=0 .or. exact_ch=1
  SCALAR, pointer, intent(in) :: aqsch(:)
  !> (sig%ntband) Uninitialized in FF calculations
  SCALAR, pointer, intent(out) :: acht_n1(:)
  !> (3 or sig%nfreqeval) Uninitialized unless freq_dep=1
  SCALAR, pointer, intent(out) :: asxt(:), acht(:)
  SCALAR, intent(out) :: achtcor
  SCALAR, intent(out) :: asigt_imag
  integer, intent(in) :: nspin
  real(DP), intent(in) :: qk(:) !< (3)
  real(DP), intent(in) :: coulfact
  integer, intent(in) :: inv_igp_index(:) !< (neps)
  integer, intent(in) :: ngpown
  !> The following pointers are uninitialized unless we are running the complex version
  complex(DPC), pointer, intent(in) :: epsR(:,:,:),epsA(:,:,:) !< (sig%nFreq,neps,ngpown)
  complex(DPC), pointer, intent(out) :: achtD_n1(:) !< (sig%ntband)
  complex(DPC), pointer, intent(out) :: asxtDyn(:), achtDyn(:), achtDyn_cor(:), ach2tDyn(:) !< (sig%nfreqeval)
  complex(DPC), pointer, intent(out) :: achtDyn_corb(:) !< (sig%nfreqeval)
  integer, intent(in) :: icalc

  SCALAR, allocatable :: epstemp(:), aqsntemp(:,:),aqsmtemp(:,:)
  complex(DPC), allocatable :: epsRtemp(:,:),epsAtemp(:,:)
  SCALAR :: acht_n1_loc(sig%ntband)


  integer :: ijk, my_igp, indigp, ipe, j
  integer*8 :: cycle_count, cycle_count_total, ncouls2
  real(DP) :: qkk(3),diff,diffmin,limitone,limittwo
  logical :: flag_occ
  integer :: ig,igp,igpp,igpp2,iw,iwlda,n1,iband,n1true,nstart,nend,igmax,gpp(3)
  real(DP) :: wx,ssxcutoff,delw2,occ,occfact,tempval
  real(DP), allocatable :: wx_array(:)
  real(DP) :: fact1,fact2,wxt,rden
! chs - partial sum static CH, chx - exact static CH
  SCALAR :: achstemp,schstemp,achxtemp,ssx,sch,ssxt,scht, &
    schs,schx,matngmatmgp,matngpmatmg,epsggp,I_epsggp,achxtemp_gp, &
    Omega2,wtilde2,asxtemp_loc,achtemp_loc, &
    mygpvar1,mygpvar2
  SCALAR, allocatable :: asxtemp(:),achtemp(:),ssx_array(:),sch_array(:)
  SCALAR, allocatable :: wpmtx(:),I_eps_array(:,:)
  real(DP) :: e_lk, e_n1kq, lambda, phi
  complex(DPC) :: wtilde,halfinvwtilde,wtilde2_temp,delw,wdiff,cden
  complex(DPC), allocatable :: wtilde_array(:,:)
  integer :: iout, my_id

! full-frequency

  integer :: ifreq
  real(DP) :: E_max, pref_zb
  real(DP), allocatable :: pref(:)
  real(DP) :: delwr,wdiffr
  complex(DPC) :: schD,achsDtemp,schsDtemp
  complex(DPC), allocatable :: matngmatmgpD(:,:),matngpmatmgD(:,:)
  complex(DPC), allocatable :: asxDtemp(:),achDtemp(:),ach2Dtemp(:),achDtemp_cor(:)
  complex(DPC), allocatable :: achDtemp_corb(:)
  complex(DPC), allocatable :: schDi(:),schDi_cor(:),schDi_corb(:)
  complex(DPC), allocatable :: sch2Di(:)
  complex(DPC), allocatable :: ssxDi(:)
  complex(DPC) :: ssxDit,ssxDitt,ssxDittt,schDt,schDtt,sch2dt,sch2dtt
  complex(DPC), allocatable :: schDt_array(:)
  complex(DPC) :: schDttt,schDttt_cor
  complex(DPC) :: schDt_avg, schDt_right, schDt_left, schDt_lin, schDt_lin2, schDt_lin3
  real(DP), allocatable :: wxi(:)
  real(DP) :: cedifft_zb,intfact,cedifft_zb_left,cedifft_zb_right
  complex(DPC) :: cedifft_coh,cedifft_cor
  complex(DPC), allocatable :: epsRggp(:),epsAggp(:),I_epsRggp(:),I_epsAggp(:)
  complex(DPC), allocatable :: I_epsR_array(:,:,:),I_epsA_array(:,:,:)
  complex(DPC) :: I_epsRggp_int, I_epsAggp_int
  
  PUSH_SUB(mtxel_sxch)

  if(sig%freq_dep .eq. -1) then
    call die("BUG: cannot call mtxel_sxch for Hartree-Fock!")
  endif

  limitone=1D0/(TOL_Small*4D0)
  limittwo=sig%gamma**2

! Initialize Output Arrays
! SIB: Zero contribution to asx(n) and ach(n) for this irq

  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
    asxt(:) = ZERO
    acht(:) = ZERO
    acht_n1(:) = ZERO
    acht_n1_loc(:) = ZERO
  elseif (sig%freq_dep.eq.2) then
    asxtDyn(:) = (0.0d0,0.0d0)
    achtDyn(:) = (0.0d0,0.0d0)
    achtDyn_cor(:) = (0.0d0,0.0d0)
    achtDyn_corb(:) = (0.0d0,0.0d0)
    ach2tDyn(:) = (0.0d0,0.0d0)
    achtD_n1(:) = (0.0d0,0.0d0)
  endif
  achtcor = ZERO
  asigt_imag = ZERO

  if (peinf%my_pool .eq. -1) then
    POP_SUB(mtxel_sxch)
    return
  endif

  call timacc(41,1)

!-----------------------
! Initialization for full-frequency CH integral

  if (sig%freq_dep.eq.2) then
    SAFE_ALLOCATE(pref, (sig%nFreq))
#ifdef CPLX
    pref_zb = 0.5D0 / PI_D
#else
    pref_zb = 1D0 / PI_D
#endif
    do ifreq=1,sig%nFreq
      if (ifreq .lt. sig%nFreq) then
        pref(ifreq)=(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))/PI_D
      else
        pref(ifreq)=pref(ifreq-1)
      endif
    enddo
    pref(1)=pref(1)*0.5d0
    pref(sig%nFreq)=pref(sig%nFreq)*0.5d0
#ifdef CPLX
    do ifreq=1,sig%nFreq
      pref(ifreq)=pref(ifreq)*0.5d0
    enddo
#endif
    E_max=sig%dFreqGrid(sig%nFreq)
    SAFE_ALLOCATE(asxDtemp, (sig%nfreqeval))
    asxDtemp = 0D0
    SAFE_ALLOCATE(achDtemp, (sig%nfreqeval))
    achDtemp = 0D0
    SAFE_ALLOCATE(achDtemp_cor, (sig%nfreqeval))
    achDtemp_cor = 0D0
    SAFE_ALLOCATE(achDtemp_corb, (sig%nfreqeval))
    achDtemp_corb = 0D0
    SAFE_ALLOCATE(ach2Dtemp, (sig%nfreqeval))
    ach2Dtemp = 0D0
    SAFE_ALLOCATE(schDi, (sig%nfreqeval))
    schDi=0D0
    SAFE_ALLOCATE(schDi_cor, (sig%nfreqeval))
    schDi_cor=0D0
    SAFE_ALLOCATE(schDi_corb, (sig%nfreqeval))
    schDi_corb=0D0
    SAFE_ALLOCATE(sch2Di, (sig%nfreqeval))
    sch2Di=0D0
    SAFE_ALLOCATE(ssxDi, (sig%nfreqeval))
    ssxDi=0D0
    SAFE_ALLOCATE(wxi, (sig%nfreqeval))
    wxi=0D0
    SAFE_ALLOCATE(I_epsR_array, (ncouls,ngpown,sig%nFreq))
    SAFE_ALLOCATE(I_epsA_array, (ncouls,ngpown,sig%nFreq))
    !SAFE_ALLOCATE(I_epsR_array, (sig%nFreq,ncouls,ngpown))
    !SAFE_ALLOCATE(I_epsA_array, (sig%nFreq,ncouls,ngpown))
    SAFE_ALLOCATE(matngmatmgpD, (ncouls,ngpown))
    SAFE_ALLOCATE(matngpmatmgD, (ncouls,ngpown))
  else if (sig%freq_dep.eq.1) then
    SAFE_ALLOCATE(I_eps_array, (ncouls,ngpown))
    SAFE_ALLOCATE(wtilde_array, (ncouls,ngpown))
    call timacc(45,1)
!$OMP PARALLEL DO private(j)
    do j = 1, ngpown
      I_eps_array(:,j)=ZERO
      wtilde_array(:,j)=(0D0,0D0)
    enddo
!$OMP END PARALLEL DO
    call timacc(45,2)
  else if (sig%freq_dep.eq.3) then
    SAFE_ALLOCATE(I_eps_array, (ncouls,ngpown))
    SAFE_ALLOCATE(wtilde_array, (ncouls,ngpown))
    I_eps_array(:,:)=ZERO
    wtilde_array(:,:)=(0D0,0D0)
  else
    SAFE_ALLOCATE(I_eps_array, (ncouls,ngpown))
  endif

! SIB: the array wfnk%ek has bands indexed by sig%diag
! take the right band index

  iband = sig%diag(il)
  e_lk = wfnk%ek(iband,ispin)

! JRD: Initial frequencies for plasmon pole case.
! Note: below assumes that we will use iw=1,2,3 in
! subroutine shiftenergy depending on value of sig%fdf

  if (sig%freq_dep.eq.1 .or. sig%freq_dep.eq.0.or.sig%freq_dep.eq.3) then
    if (sig%fdf.eq.-3) then
      SAFE_ALLOCATE(asxtemp,(sig%nfreqeval))
      SAFE_ALLOCATE(achtemp,(sig%nfreqeval))
    else
      SAFE_ALLOCATE(asxtemp,(3))
      SAFE_ALLOCATE(achtemp,(3))
    endif
  endif

  if (sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
    if (sig%fdf.eq.-3) then
      nstart=1
      nend=sig%nfreqeval
      SAFE_ALLOCATE(wxi, (sig%nfreqeval))
      SAFE_ALLOCATE(wx_array, (sig%nfreqeval))
      do iw=1,sig%nfreqeval
        wx = sig%freqevalmin + (iw-1)*sig%freqevalstep
        wxi(iw) = wx
      enddo
    elseif (sig%fdf.eq.-1) then
      nstart = 1
      nend = 2
      SAFE_ALLOCATE(wx_array, (3))
    elseif (sig%fdf.eq.0) then
      nstart = 1
      nend = 3
      SAFE_ALLOCATE(wx_array, (3))
    elseif (sig%fdf.eq.1.or.(sig%fdf.eq.2.and.icalc.eq.1)) then
      nstart = 2
      nend = 3
      SAFE_ALLOCATE(wx_array, (3))
    else
      nstart = 2
      nend = 2
      SAFE_ALLOCATE(wx_array, (3))
    endif
  endif

  call timacc(41,2)

!------------------------
! JRD: This changes qkk to q0vec for q=0 instead of using exactly zero

#ifdef VERBOSE
  qkk=qk
  diff = abs(qkk(1)) + abs(qkk(2)) + abs(qkk(3))
  if (diff.lt.TOL_Zero) then
    qkk(:) = sig%q0vec(:)
    if (in .eq. 1 .and. il .eq. 1) then ! no need to write again for off-diagonals
      if (peinf%inode.eq.0) then
        write(7,998) qkk
      endif
    endif
  endif
998 format(/,6x,"q0vec =",3f7.4)
#endif

!-------------- Main loop over G` (igp) ---------------------------------------

  call timacc(42,1)

  cycle_count = 0
  achxtemp = ZERO

! In below OMP region:
! I_eps_array, I_epsR_array, I_epsA_array, wtilde_array are shared

!$OMP PARALLEL private (ig,epsggp,I_epsggp,gpp,iout,schx,igpp,igpp2,achxtemp_gp,igp,indigp, &
!$OMP                      igmax,epstemp,I_epsRggp,epsRggp,epsRtemp, &
#ifdef CPLX
!$OMP                      epsAtemp,wtilde2_temp,lambda,phi,epsAggp,I_epsAggp, &
#endif
!$OMP                      wpmtx,wtilde,wtilde2,Omega2)

#ifdef OMP
  my_id = omp_get_thread_num()
#else
  my_id = 0
#endif

! Allocate Temporary Arrays

  select case(sig%freq_dep)
  case(0)
    SAFE_ALLOCATE(epstemp, (neps))
  case(1)
    SAFE_ALLOCATE(epstemp, (neps))
    SAFE_ALLOCATE(wpmtx, (neps))
  case(2)
    SAFE_ALLOCATE(epsRtemp, (sig%nFreq,neps))
    SAFE_ALLOCATE(epsRggp, (sig%nFreq))
    SAFE_ALLOCATE(I_epsRggp, (sig%nFreq))
#ifdef CPLX
    SAFE_ALLOCATE(epsAtemp, (sig%nFreq,neps))
    SAFE_ALLOCATE(epsAggp, (sig%nFreq))
    SAFE_ALLOCATE(I_epsAggp, (sig%nFreq))
#endif
  case(3)
    SAFE_ALLOCATE(epsRtemp, (sig%nFreq,neps))
    SAFE_ALLOCATE(epsRggp, (sig%nFreq))
    SAFE_ALLOCATE(I_epsRggp, (sig%nFreq))
  end select

!$OMP DO reduction(+:achxtemp,cycle_count)
  do my_igp=1,ngpown

    indigp = inv_igp_index(my_igp)
    igp = indinv(indigp)

    if (igp .gt. ncouls) write(6,*) "CATASTROPHE", peinf%inode, my_igp, igp
    if (igp .gt. ncouls .or. igp .le. 0) cycle

! JRD: We will soon remove ggpsum 1 for all freq_dep

    if (sig%ggpsum.eq.1) then
      igmax=igp
    else
      igmax=ncouls
    endif
    
    if ( sig%freq_dep .eq. 2) igmax = ncouls

!!------------- Initialize eps^-1 for this G` ---------------------------------

    select case(sig%freq_dep)
    case(0,1)
      epstemp(:)=eps(:,my_igp)
    case(2)
      epsRtemp(:,:)=epsR(:,:,my_igp)
#ifdef CPLX
      epsAtemp(:,:)=epsA(:,:,my_igp)
#endif
    case(3)
      epsRtemp(:,:)=epsR(:,:,my_igp)
    end select

!------------------------------------------------------------------------------
! (gsm) Below you`ll find the code to compute SX & CH in the static COHSEX
!       approximation (both the exact and partial sum expressions for CH),
!       within the GPP model, and using the full frequency dependent RPA
!       inverse dielectric matrix. The GPP section of the code is well
!       commented (thanks to Sohrab), while the COHSEX and RPA sections
!       are not... But most of the variables are the same anyways,
!       so just look at the comments in the GPP section.
!------------------------------------------------------------------------------

! (gsm) <<<<<< static COHSEX approximation - exact static CH >>>>>>

! < n k | \Sigma_{CH} (r, r`; 0) | m k > =
! \frac{1}{2} \sum_{q G G`}
! < n k | e^{i (G - G`) \cdot r} | m k >
! [\eps_{G G`}^{-1} (q; 0) - \delta_{G G`}] v (q + G`)

    if (sig%freq_dep.eq.0.or.sig%exact_ch.eq.1) then
    
! Only Computed on One Processor Per Pool

      ! JRD: Since, we are distributed over igp now, all procs need to do this

      achxtemp_gp = ZERO

      do ig=1,igmax

        if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
          epsggp = ph(ig)*MYCONJG(ph(igp))*epstemp(ind(ig))
        else
          epsggp = ph(ig)*MYCONJG(ph(igp))*SCALARIFY(epsRtemp(1,ind(ig)))
        endif
        if (ig.eq.igp) then
          I_epsggp = epsggp - 1.0d0
        else
          I_epsggp = epsggp
        endif
        if (abs(I_epsggp).lt.TOL_Small) cycle
        if (ig.ne.igp) then
          gpp(:)=gvec%components(:,isrtrq(ig))-gvec%components(:,isrtrq(igp))
          call findvector(iout,gpp,gvec)
          if (iout.eq.0) cycle
          igpp=isrtrqi(iout)
          if (igpp.lt.1.or.igpp.gt.ncoulch) cycle
          gpp(:)=gvec%components(:,isrtrq(igp))-gvec%components(:,isrtrq(ig))
          call findvector(iout,gpp,gvec)
          if (iout.eq.0) cycle
          igpp2=isrtrqi(iout)
          if (igpp2.lt.1.or.igpp2.gt.ncoulch) cycle
        else
          gpp(:)=0
          call findvector(iout,gpp,gvec)
          if (iout.eq.0) cycle
          igpp=isrtrqi(iout)
          if (igpp.lt.1.or.igpp.gt.ncoulch) cycle
        endif
        schx = aqsch(igpp) * I_epsggp
! XXX: TEMP Just turn off ggpsum1 for full_freq. Soon delete for all.
        if (sig%ggpsum.eq.1.and.ig.lt.igp.and.sig%freq_dep.ne.2) then
          schx = schx + aqsch(igpp2) * MYCONJG(I_epsggp)
        endif
        achxtemp_gp = achxtemp_gp + schx 
      enddo ! over G (ig)

      achxtemp = achxtemp + achxtemp_gp * vcoul(igp) * 0.5d0

    endif ! sig%freq_dep.eq.0.or.sig%exact_ch.eq.1

!!-----------------------------------------------------------------------------------------
! (gsm) <<<<<< static COHSEX approximation - CH as a partial sum over empty bands >>>>>>

! < n k | \Sigma_{SX} (r, r`; 0) | m k > =
! - \sum_{n_1}^{occ} \sum_{q G G`}
! < n k | e^{i (q + G) \cdot r} | n_1 k - q >
! < n_1 k - q | e^{- i (q + G`) \cdot r`} | m k >
! [\eps_{G G`}^{-1} (q; 0) - \delta_{G G`}] v (q + G`)

! < n k | \Sigma_{CH} (r, r`; 0) | m k > =
! \frac{1}{2} \sum_{n_1} \sum_{q G G`}
! < n k | e^{i (q + G) \cdot r} | n_1 k - q >
! < n_1 k - q | e^{- i (q + G`) \cdot r`} | m k >
! [\eps_{G G`}^{-1} (q; 0) - \delta_{G G`}] v (q + G`)


    if (sig%freq_dep.eq.0) then

      do ig=1,igmax
        epsggp = ph(ig)*MYCONJG(ph(igp))*epstemp(ind(ig))
        if (ig.eq.igp) then
          I_epsggp = epsggp - 1.0d0
        else
          I_epsggp = epsggp
        endif

        I_eps_array(ig,my_igp) = I_epsggp

      enddo ! over G (ig)

! (gsm) <<<<<< Generalized Plasmon Pole model >>>>>>

    elseif (sig%freq_dep.eq.1) then

! Zero out temporary accumulation variables

!----------------------
! Calculate Plasma Frequencies
!
! SIB: Here we get the plasmon-pole effective plasma frequecies
! Omega(G,G`)^2 (equation (31) of Hybersten & Louie, PRB 34, 1986, p 5396)
! which come in the vector wp(G) for current G`.

! SIB: We calculate wp(G) for a given G` (trade-off for memory)
! Note that wp(G) G=1,ncouls requires O(N) operations
! Even if we redo it for each band n, not so bad

! JRD: I changed this to use qk instead of qkk because we use vcoul at q=0
! (instead of small q) throughout


! given how many times this routine is called, timing it appears to take a non-negligible amount of time

      call wpeff(crys,gvec,wpg,sig,neps,isrtrq,igp,ncouls,wpmtx,nspin,qk,vcoul,coulfact)

!!------ For given G` (igp), loop over all G vectors (ig) in lower triangle ---

      if (my_id.eq.0) call timacc(46,1)

      do ig=1,igmax

! Put epsilon(G,G`) into epsggp

        epsggp = ph(ig)*MYCONJG(ph(igp))*epstemp(ind(ig))

! I_epsggp = Kronecker(G,G`) - eps(G,G`)

        if (ig.eq.igp) then
          I_epsggp = ONE - epsggp
        else
          I_epsggp = ZERO - epsggp
        endif

! If I_epsggp is too small, then we skip this (G,G`) entry
! This only happens when eps is 1 on diagonal or 0 off diagonal
! but, this means no screening correct and is already treated properly in bare
! exchange term

        if (abs(I_epsggp).lt.TOL_Small) cycle

        I_eps_array(ig,my_igp) = I_epsggp

! Omega2 = Omega(G,G`)^2 = effective plasma freq^2

        Omega2 = wpmtx(ig)

! If Omega2 is too small, then we skip this (G,G`) entry
! JRD: I am not sure why we have to cycle here... :/ Probably not needed

        if (abs(Omega2).lt.TOL_Small) cycle

#ifdef CPLX

! <<<<<<<<<<<< COMPLEX GPP >>>>>>>>>>>>

! (gsm) equations (17), (20), (21) from [PRB 40, 3162 (1989)]

        wtilde2_temp = Omega2 / I_epsggp
 
        lambda = abs(wtilde2_temp)
        if (lambda .lt. TOL_Small) cycle
        phi = atan2(IMAG(wtilde2_temp), dble(wtilde2_temp))
        if (abs(cos(phi)) .lt. TOL_Small) cycle
        wtilde2 = lambda / cos(phi)
! this is not needed because we recalculate Omega2 below
!        Omega2 = Omega2 * CMPLX(1.0d0, -tan(phi))

#else

! <<<<<<<<<<<< REAL GPP >>>>>>>>>>>>

! (gsm) equation (30) from [PRB 34, 5390 (1986)]

        wtilde2 = Omega2 / I_epsggp

        if (abs(wtilde2) .lt. TOL_Small) cycle

#endif

! gsm & jrd: find "complex" mode frequency wtilde

        wtilde = sqrt(COMPLEXIFY(wtilde2))

! BEGIN OLD GPP
        !if (ig.eq.igp) then
        !  if (dble(wtilde2_temp).lt.TOL_Small) cycle
        !else
        !  if (abs(wtilde2_temp).lt.TOL_Small) cycle
        !  if (dble(wtilde2_temp)/abs(wtilde2_temp).lt.TOL_Small) cycle
        !endif
        !wtilde = sqrt(wtilde2_temp)
! END OLD GPP

        wtilde_array(ig,my_igp) = wtilde

      enddo ! G Loop for GPP Setup

      if (my_id.eq.0) call timacc(46,2)



!!--------------------------------------------------------------------
! (gsm) <<<<<< full frequency dependent inverse dielectric matrix >>>>>>

! The code below makes use of the following relations:
!
! {eps_{G G`}^r}^{-1}(q, -E) = {eps_{G G`}^a}^{-1}(q, E)
! for general systems (both real and complex versions of the code)
!
! {eps_{G G`}^a}^{-1}(q, E) = {{eps_{G G`}^r}^{-1}}^{*}(q, E)
! for systems with inversion symmetry (the real version of the code)
! since plane-wave matrix elements are real
!
! {eps_{G G}^a}^{-1}(q, E) = {{eps_{G G}^r}^{-1}}^{*}(q, E)
! for general systems, the diagonal of the matrix (G` = G)
! since plane-wave matrix elements are complex conjugates of each other
!
! The complex version of the code uses
! {eps_{G G`}^r}^{-1}(q, E) and {eps_{G G`}^a}^{-1}(q, E) for E >= 0
!
! The real version of the code uses
! {eps_{G G`}^r}^{-1}(q, E) for E >= 0

! CHP: full frequency routine - the case for sig%ggpsum == 1
!
! On top of the above relations, we need the following:
! Assuming that W_{G,G`}={eps_{G,G`}}^{-1} v(q+G`),
!
! W_{G`,G}^r(E) = {W_{G,G`}^a}^{*}(E)
!               = {W_{G,G`}^r}^{*}(-E) (this form is used if E<0)
! for general systems (both real and complex version of the code)
!
! W_{G`,G}^a(E) = {W_{G,G`}^r}^{*}(E)
!               = {W_{G,G`}^a}^{*}(-E) (this form is used if E<0)
! for general systems (both real and complex version of the code)
!
! W_{G`,G}^r(E) = W_{G,G`}^r(E)
! for systems with inversion symmetry
!
! W_{G`,G}^a(E) = W_{G,G`}^a(E)
! for systems with inversion symmetry
!
! Note that eps^{-1} does not have these symmetries.

    elseif (sig%freq_dep.eq.2) then

! JRD: This may be slow - due to reordering of memory
      do ig=1,igmax
        
        epsRggp(:) = ph(ig)*MYCONJG(ph(igp))*epsRtemp(:,ind(ig))
#ifdef CPLX
        epsAggp(:) = ph(ig)*MYCONJG(ph(igp))*epsAtemp(:,ind(ig))
#endif
        
        if (ig.eq.igp) then
          I_epsRggp = 1.0d0 - epsRggp
#ifdef CPLX
          I_epsAggp = 1.0d0 - epsAggp
#endif
        else
          I_epsRggp = - epsRggp
#ifdef CPLX
          I_epsAggp = - epsAggp
#endif
        endif

! XXX - This could be bad with threads. Cachelines could be conflicting.
! Seems OK in practice so far.
        I_epsR_array(ig,my_igp,:) = I_epsRggp(:)
        !I_epsR_array(:,ig,my_igp) = I_epsRggp(:)
#ifdef CPLX
        I_epsA_array(ig,my_igp,:) = I_epsAggp(:)
        !I_epsA_array(:,ig,my_igp) = I_epsAggp(:)
#endif

      enddo
      
    endif ! sig%freq_dep

  enddo ! over G` (igp)
!$OMP END DO



  if (sig%freq_dep.eq.0) then
    SAFE_DEALLOCATE(epstemp)
  endif
  if (sig%freq_dep.eq.1) then
    SAFE_DEALLOCATE(wpmtx)
    SAFE_DEALLOCATE(epstemp)
  endif
  if (sig%freq_dep.eq.2) then
    SAFE_DEALLOCATE(epsRtemp)
    SAFE_DEALLOCATE(epsRggp)
    SAFE_DEALLOCATE(I_epsRggp)
#ifdef CPLX
    SAFE_DEALLOCATE(epsAtemp)
    SAFE_DEALLOCATE(epsAggp)
    SAFE_DEALLOCATE(I_epsAggp)
#endif
  endif
  if (sig%freq_dep.eq.3) then
    SAFE_DEALLOCATE(epsRtemp)
    SAFE_DEALLOCATE(epsRggp)
    SAFE_DEALLOCATE(I_epsRggp)
  endif

!$OMP END PARALLEL

  call timacc(42,2)

  !if (peinf%inode .eq. 0) write(6,*) "Done Setup"

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!! END SETUP / BEGIN COMPUTATION OF SUMS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

! JRD Reduce static cohsex version?

  if (sig%freq_dep .eq. 0) then

    asxtemp(:) = ZERO
    achtemp(:) = ZERO

    SAFE_ALLOCATE(aqsntemp,(ncouls,peinf%ntband_max))
    SAFE_ALLOCATE(aqsmtemp,(ncouls,peinf%ntband_max))

! JRD LOOP OVER NPES PER POOL

    do ipe = 1, peinf%npes_pool

      call timacc(43,1)

      if (peinf%pool_rank .eq. ipe-1) then
        aqsntemp(:,:) = aqsn(1:ncouls,:)
        aqsmtemp(:,:) = aqsm(1:ncouls,:)
      endif

! JRD: broadcast in just pool

#ifdef MPI
      if (peinf%my_pool .ne. -1) then
        call MPI_Bcast(aqsntemp,peinf%ntband_max*ncouls,MPI_SCALAR,ipe-1,peinf%pool_comm,mpierr)
        call MPI_Bcast(aqsmtemp,peinf%ntband_max*ncouls,MPI_SCALAR,ipe-1,peinf%pool_comm,mpierr)
      endif
#endif

      call timacc(43,2)

      do n1=1,peinf%ntband_dist(ipe)

! n1true = "True" band index of the band n1 w.r.t. all bands

        call timacc(51,1)

        n1true = peinf%indext_dist(n1,ipe)

! energy of the |n1,k-q> state

        e_n1kq = wfnkq%ekq(n1true,ispin)

        !if (peinf%inode .eq. 0) write(6,*) 'ipe,n1,n1true',ipe,n1,n1true,e_n1kq

! occupation of the |n1,k-q> state

        flag_occ = (n1true.le.(sig%nvband+sig%ncrit)) &
          .and.((sig%ncrit.eq.0).or.(e_n1kq.le.(sig%efermi+sig%tol)))
          
        tempval=abs(e_n1kq-sig%efermi)
        if (tempval .lt. sig%tol) then
          occ = 0.5d0 ! Fermi-Dirac distribution = 1/2 at Fermi level
        else
          occ = 1.0d0
        endif

        call timacc(51,2)

        call timacc(52,1)

!$OMP PARALLEL do private (my_igp,igp,indigp,igmax,mygpvar1,mygpvar2,ig,ssx,sch, &
!$OMP                       matngmatmgp,matngpmatmg,schstemp,achtemp_loc, &
!$OMP                       asxtemp_loc) reduction(+:achtemp,asxtemp,acht_n1_loc)
        do my_igp = 1, ngpown
          indigp = inv_igp_index(my_igp)
          igp = indinv(indigp)

          if (igp .gt. ncouls .or. igp .le. 0) cycle

          if (sig%ggpsum.eq.1) then
            igmax=igp
          else
            igmax=ncouls
          endif

          schstemp = ZERO
          achtemp_loc = ZERO
          asxtemp_loc = ZERO

          do ig = 1, igmax

            ! Cycle bad for vectorization
            !if (abs(I_eps_array(ig,my_igp)).lt.TOL_Small) cycle
            ssx = I_eps_array(ig,my_igp)

            matngmatmgp = aqsntemp(ig,n1) * MYCONJG(aqsmtemp(igp,n1))
            if (sig%ggpsum.eq.1) &
              matngpmatmg = aqsntemp(igp,n1) * MYCONJG(aqsmtemp(ig,n1))
          
            if(sig%exact_ch == 0) sch = ssx * 0.5d0

            if (sig%ggpsum.eq.1.and.ig.lt.igp) then
              ssx = matngmatmgp*ssx + matngpmatmg*MYCONJG(ssx)
              if(sig%exact_ch == 0) then 
                sch = matngmatmgp*sch + matngpmatmg*MYCONJG(sch)
                achtemp_loc = achtemp_loc + sch
              endif
            else
              ssx = matngmatmgp*ssx
              if(sig%exact_ch == 0) then 
                sch = matngmatmgp*sch
                achtemp_loc = achtemp_loc + sch
              endif
            endif

            if (flag_occ) then
              asxtemp_loc = asxtemp_loc - ssx
            endif

          enddo ! ig
          asxtemp(2) = asxtemp(2) + asxtemp_loc*occ*vcoul(igp)          
          if(sig%exact_ch == 0) achtemp(2) = achtemp(2) + achtemp_loc*vcoul(igp)
          if(sig%exact_ch == 0) acht_n1_loc(n1true) = acht_n1_loc(n1true) + achtemp_loc*vcoul(igp)
        enddo ! my_igp
        call timacc(52,2)
      enddo ! over bands (n1)
    enddo ! over ipe

    if (sig%exact_ch.eq.1) then
      achtemp(2)=achxtemp
      achxtemp=ZERO
    endif

    asxt(:) = asxt(:) + asxtemp(2)
    acht(:) = acht(:) + achtemp(2)

    if (sig%exact_ch .eq. 0) then
      achtcor = achtcor + (achxtemp - achtemp(2))
    endif

    SAFE_DEALLOCATE(I_eps_array)

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

  else if (sig%freq_dep .eq. 1 .or. sig%freq_dep.eq.3) then

! Some constants used in the loop below, computed here to save
! floating point operations

!!!----- Loop over all bands n1 on curr. processor -----

    asxtemp(:) = ZERO
    achtemp(:) = ZERO
    if (sig%exact_ch.eq.1) achstemp = ZERO

    SAFE_ALLOCATE(aqsntemp,(ncouls,peinf%ntband_max))
    SAFE_ALLOCATE(aqsmtemp,(ncouls,peinf%ntband_max))

! JRD NOW LOOP OVER NPES PER POOL

    do ipe = 1, peinf%npes_pool

      call timacc(43,1)

      if (peinf%pool_rank .eq. ipe-1) then
        aqsntemp(:,:) = aqsn(1:ncouls,:)
        aqsmtemp(:,:) = aqsm(1:ncouls,:)
      endif

! JRD BCAST in just pool

#ifdef MPI
      if (peinf%my_pool .ne. -1) then
        call MPI_Bcast(aqsntemp,peinf%ntband_max*ncouls,MPI_SCALAR,ipe-1,peinf%pool_comm,mpierr)
        call MPI_Bcast(aqsmtemp,peinf%ntband_max*ncouls,MPI_SCALAR,ipe-1,peinf%pool_comm,mpierr)
      endif
#endif

      call timacc(43,2)

      do n1=1,peinf%ntband_dist(ipe)

! n1true = "True" band index of the band n1 w.r.t. all bands

        call timacc(51,1)

        n1true = peinf%indext_dist(n1,ipe)

! energy of the |n1,k-q> state

        e_n1kq = wfnkq%ekq(n1true,ispin)

        !if (peinf%inode .eq. 0) write(6,*) 'ipe,n1,n1true',ipe,n1,n1true,e_n1kq

! occupation of the |n1,k-q> state

        flag_occ = (n1true.le.(sig%nvband+sig%ncrit)) &
          .and.((sig%ncrit.eq.0).or.(e_n1kq.le.(sig%efermi+sig%tol)))
          
        tempval=abs(e_n1kq-sig%efermi)
        if (tempval .lt. sig%tol) then
          occ = 0.5d0 ! Fermi-Dirac distribution = 1/2 at Fermi level
        else
          occ = 1.0d0
        endif
          
! (gsm) compute the static CH for the static remainder

        if (sig%exact_ch.eq.1) then
!$OMP PARALLEL do private (my_igp,igp,indigp,igmax,mygpvar1,mygpvar2,ig,schs, &
!$OMP                       matngmatmgp,matngpmatmg,schstemp) reduction(+:achstemp)
          do my_igp = 1, ngpown
            indigp = inv_igp_index(my_igp)
            igp = indinv(indigp)

            if (igp .gt. ncouls .or. igp .le. 0) cycle

            if (sig%ggpsum.eq.1) then
              igmax=igp
            else
              igmax=ncouls
            endif

            mygpvar1 = MYCONJG(aqsmtemp(igp,n1))
            mygpvar2 = aqsntemp(igp,n1)

            schstemp = ZERO

! We do two loops here for performance. Don`t want to evaluate if statements inside loop 
! at every iteration

            if (sig%ggpsum.eq.1) then
              do ig = 1, igmax - 1
                schs=-I_eps_array(ig,my_igp)
! JRD: Cycle bad for vectorization. 
! I_eps_array is already set to zero above for these ig,igp
!                if (abs(schs).lt.TOL_Small) cycle
                matngmatmgp = aqsntemp(ig,n1) * mygpvar1
                matngpmatmg = MYCONJG(aqsmtemp(ig,n1)) * mygpvar2
                schstemp = schstemp + matngmatmgp*schs + matngpmatmg*MYCONJG(schs)
              enddo
              ig = igp
              schs=-I_eps_array(ig,my_igp)
              matngmatmgp = aqsntemp(ig,n1) * mygpvar1
              if (abs(schs).gt.TOL_Small) schstemp = schstemp + matngmatmgp*schs
            else
              do ig = 1, igmax
                !schs=-I_eps_array(ig,my_igp)
! JRD: Cycle bad for vectorization. 
! I_eps_array is already set to zero above for these ig,igp
!             if (abs(schs).lt.TOL_Small) cycle
                !matngmatmgp = aqsntemp(ig,n1) * mygpvar1
                schstemp = schstemp - aqsntemp(ig,n1) * I_eps_array(ig,my_igp) * mygpvar1
                !schstemp = schstemp + matngmatmgp * schs
              enddo
            endif            

            achstemp = achstemp + schstemp*vcoul(igp)*0.5d0
          enddo
!$OMP END PARALLEL DO
        endif ! sig%exact_ch.eq.1

        call timacc(51,2)

!!!!--- Loop over three energy values which we compute Sigma -----------
!
! SIB: In terms of formulae, the two terms we will compute are given in
! formulae (34a,34b) on p. 5397 of above reference.
!
!                                         Omega^2(G,G`)
!    SX(E) = M(n,G)*conj(M(m,G`)) * ------------------------ * Vcoul(G`)
!                                   (E-E_n1(k-q))^2-wtilde^2
!
!                                           Omega^2(G,G`)
!    CH(E) = M(n,G)*conj(M(m,G`)) * ----------------------------- * Vcoul(G`)
!                                   2*wtilde*[E-E_n1(k-q)-wtilde]
!
! and we are evaluating both at E = E_l(k) and E = E_l(k) +/- dE.
! SX only gets contributions for the band n1 being an occupied state.
!
! For diagonal elements, we need matrix elements at various
! energies (to get quasi-particle energies), i.e. for iw=1,2,3;
! but for off-diagonal elements, we only need them at iw=2
! (the actual energy) so we won`t even bother calculating
! them at other energies

        do iw=nstart,nend
! wx = E_l(k) - E_n1(k-q) + dE = difference in energies
! of the two states appearing as E - E_n1(k-q) above.
          if (sig%fdf .eq. -3) then
            wx_array(iw) = e_lk - e_n1kq + wxi(iw)
          else 
            wx_array(iw) = e_lk - e_n1kq + sig%dw*(iw-2)
          endif
          if (abs(wx_array(iw)) .lt. TOL_Zero) wx_array(iw) = TOL_Zero
        enddo

        call timacc(52,1)

! JRD: This Loop is Performance critical. Make Sure you don`t mess it up

!$OMP PARALLEL private (my_igp,igp,indigp,igmax,mygpvar1,mygpvar2,ssx_array,sch_array,ig, &
!$OMP                      wtilde,wtilde2,halfinvwtilde,ssxcutoff,matngmatmgp,matngpmatmg,sch,ssx, &
!$OMP                      iw,delw,delw2,Omega2,scht,ssxt,wxt, &
!$OMP                      rden,cden,delwr,wdiffr,occfact)

        if (sig%fdf.eq.-3) then
          SAFE_ALLOCATE(ssx_array,(sig%nfreqeval))
          SAFE_ALLOCATE(sch_array,(sig%nfreqeval))
        else
          SAFE_ALLOCATE(ssx_array,(3))
          SAFE_ALLOCATE(sch_array,(3))
        endif

!$OMP DO reduction(+:asxtemp,acht_n1_loc,achtemp)
        do my_igp = 1, ngpown

          indigp = inv_igp_index(my_igp)
          igp = indinv(indigp)

          if (igp .gt. ncouls .or. igp .le. 0) cycle

          if (sig%ggpsum.eq.1) then
            igmax=igp
          else
            igmax=ncouls
          endif

          ssx_array = ZERO
          sch_array = ZERO

          mygpvar1 = MYCONJG(aqsmtemp(igp,n1))
          mygpvar2 = aqsntemp(igp,n1)

! delw measures how close to zero the difference
! wx - wtilde = E - E_n1(k-q) - wtilde is relative to wtilde:
! delw = (E - E_n1(k-q) - wtilde) / (2 * wtilde)
! delw2 is the squared absolute value of delw
      
! If delw is small, both SX and CH blow up, but their sum (for
! an occupied state n1) is finite.  In such a case, their sum
! is Omega^2 / (4 * wtilde2) / (1 + delw). Then the sum is
! assigned to ssx and sch is set to zero.

! If ssx is too large (which can happen due to the pole at
! wx + wtilde = 0 of the SX term), then we should drop this term.
! See the discussion at the bottom of p. 5411-5412 of Hybertsen & Louie.

! If G.neq.G`, then since we sum over only lower triangle,
! we include the contribution we would have had from (G`,G).

          if (flag_occ) then

            do iw=nstart,nend

              scht=0D0
              ssxt=0D0
              wxt = wx_array(iw)

              if (sig%ggpsum.eq.1) then

                do ig = 1, igmax

! Here we recompute Omega2 = wtilde2 * I_eps_array. This contains
! the factor of (1 - i tan phi) from Eqs. 21 & 22 of arXiv paper.

!FIXME: Here we use temporary variables wtilde, wtilde2, Omega2 while
! in the following sections we use wtilde_array and I_eps_array directly.
! JRD, please write a comment here explaining whether this is critical
! for performance or it doesn`t matter.

                  wtilde = wtilde_array(ig,my_igp)
                  wtilde2 = wtilde**2
                  Omega2 = wtilde2 * I_eps_array(ig,my_igp)

! Cycle bad for vectorization. ggpsum=1 slow anyway
                  if (abs(Omega2) .lt. TOL_Zero) cycle

                  matngmatmgp = aqsntemp(ig,n1) * mygpvar1
! JRD: This breaks vectorization but ggpsum=1 is slow for other reasons already
                  if (ig .ne. igp) matngpmatmg = MYCONJG(aqsmtemp(ig,n1)) * mygpvar2

                  halfinvwtilde = 0.5d0/wtilde
                  delw = (wxt - wtilde) * halfinvwtilde
                  delw2 = abs(delw)**2

                  if (abs(wxt-wtilde).lt.sig%gamma .or. delw2.lt.TOL_Small) then
                    sch = 0.0d0
                    if (abs(wtilde) .gt. Tol_Zero) then
                      ssx = -Omega2 / (4.0d0 * wtilde2 * (1.0d0 + delw))
                    else
                      ssx = 0D0
                    endif
                  else
                    sch = wtilde * I_eps_array(ig,my_igp) / (wxt - wtilde)
                    ssx = Omega2 / (wxt**2 - wtilde2)
                  endif

! JRD: Bad for vectorization
                  ssxcutoff = sig%sexcut*abs(I_eps_array(ig,my_igp))
                  if (abs(ssx) .gt. ssxcutoff .and. wxt .lt. 0.0d0) ssx=0.0d0

                  if (ig .ne. igp) then
                    ssxt = ssxt + matngmatmgp*ssx + matngpmatmg*MYCONJG(ssx)
                    scht = scht + matngmatmgp*sch + matngpmatmg*MYCONJG(sch)
                  else
                    ssxt = ssxt + matngmatmgp*ssx
                    scht = scht + matngmatmgp*sch
                  endif

                enddo ! loop over g

              else

                do ig = 1, igmax

! Here we recompute Omega2 = wtilde2 * I_eps_array. This contains
! the factor of (1 - i tan phi) from Eqs. 21 & 22 of arXiv paper.

!FIXME: Here we use temporary variables wtilde, wtilde2, Omega2 while
! in the following sections we use wtilde_array and I_eps_array directly.
! JRD, please write a comment here explaining whether this is critical
! for performance or it doesn`t matter.
  
                  wtilde = wtilde_array(ig,my_igp)
                  wtilde2 = wtilde**2
                  Omega2 = wtilde2 * I_eps_array(ig,my_igp)

! Cycle bad for vectorization. Not needed wtilde is zero
!                  if (abs(Omega2) .lt. TOL_Zero) cycle

                  matngmatmgp = aqsntemp(ig,n1) * mygpvar1

                  wdiff = wxt - wtilde

                  cden = wdiff
                  rden = cden * CONJG(cden)
                  rden = 1D0 / rden
                  delw = wtilde * CONJG(cden) * rden
                  delwr = delw*CONJG(delw)
                  wdiffr = wdiff*CONJG(wdiff)

! This Practice is bad for vectorization and understanding of the output.
! JRD: Complex division is hard to vectorize. So, we help the compiler.
                  if (wdiffr.gt.limittwo .and. delwr.lt.limitone) then
                    sch = delw * I_eps_array(ig,my_igp)
                    cden = wxt**2 - wtilde2
                    rden = cden*CONJG(cden)
                    rden = 1D0 / rden                    
                    ssx = Omega2 * CONJG(cden) * rden
                  else if ( delwr .gt. TOL_Zero) then
                    sch = 0.0d0
                    cden = (4.0d0 * wtilde2 * (delw + 0.5D0 ))
                    rden = cden*MYCONJG(cden)
                    rden = 1D0 / rden
                    ssx = -Omega2 * MYCONJG(cden) * rden * delw
                  else
                    sch = 0.0d0
                    ssx = 0.0d0
                  endif

! JRD: Breaks vectorization. But, I will have to fix later because
! leaving it out breaks GSM example.
                  ssxcutoff = sig%sexcut*abs(I_eps_array(ig,my_igp))
                  if (abs(ssx) .gt. ssxcutoff .and. wxt .lt. 0.0d0) ssx=0.0d0

                  ssxt = ssxt + matngmatmgp*ssx
                  scht = scht + matngmatmgp*sch

                enddo ! loop over g

              endif

! JRD: Not ideal for fdf -3, but user should see spectrum.dat

              ssx_array(iw) = ssx_array(iw) + ssxt
              sch_array(iw) = sch_array(iw) + 0.5D0*scht

            enddo

          else

            do iw=nstart,nend

              scht=0D0
              ssxt=0D0
              wxt = wx_array(iw)

              if (sig%ggpsum.eq.1) then

                do ig = 1, igmax

! Here we recompute Omega2 = wtilde2 * I_eps_array. This contains
! the factor of (1 - i tan phi) from Eqs. 21 & 22 of arXiv paper.

!FIXME: Here we use wtilde_array and I_eps_array directly while in the
! previous sections we use temporary variables wtilde, wtilde2, Omega2.
! JRD, please write a comment here explaining whether this is critical
! for performance or it doesn`t matter.

                  wtilde = wtilde_array(ig,my_igp)
            
! Cycle bad for vectorization. ggpsum=1 slow anyway
                  if (abs((wtilde**2) * I_eps_array(ig,my_igp)) .lt. TOL_Zero) cycle

                  matngmatmgp = aqsntemp(ig,n1) * mygpvar1
! JRD: If statement breaks vectorization but gppsum=1 is slow anyway
                  if (ig .ne. igp) matngpmatmg = MYCONJG(aqsmtemp(ig,n1)) * mygpvar2

                  halfinvwtilde = 0.5d0/wtilde
                  delw = (wx_array(iw) - wtilde) * halfinvwtilde
                  delw2 = abs(delw)**2

                  if (abs(wxt-wtilde).lt.sig%gamma .or. delw2.lt.TOL_Small) then
                    sch = 0.0d0
                  else
                    sch = wtilde_array(ig,my_igp) * I_eps_array(ig,my_igp) / (wxt - wtilde_array(ig,my_igp))
                  endif

                  if (ig .ne. igp) then
                    scht = scht + matngmatmgp*sch + matngpmatmg*MYCONJG(sch)
                  else
                    scht = scht + matngmatmgp*sch
                  endif 

                enddo ! loop over g

              else

                do ig = 1, igmax

! Here we recompute Omega2 = wtilde2 * I_eps_array. This contains
! the factor of (1 - i tan phi) from Eqs. 21 & 22 of arXiv paper.

!FIXME: Here we use wtilde_array and I_eps_array directly while in the
! previous sections we use temporary variables wtilde, wtilde2, Omega2.
! JRD, please write a comment here explaining whether this is critical
! for performance or it doesn`t matter.
            
! Cycle bad for vectorization. Not needed witlde is zero
                  !if (abs((wtilde_array(ig,my_igp)**2) * I_eps_array(ig,my_igp)) .lt. TOL_Zero) cycle

                  wdiff = wxt - wtilde_array(ig,my_igp)

                  cden = wdiff
                  rden = cden * CONJG(cden)
                  rden = 1D0 / rden
                  delw = wtilde_array(ig,my_igp) * CONJG(cden) * rden
                  delwr = delw*CONJG(delw)
                  wdiffr = wdiff*CONJG(wdiff)

! JRD: This if is OK for vectorization
                  if (wdiffr.gt.limittwo .and. delwr.lt.limitone) then
                    scht = scht + mygpvar1 * aqsntemp(ig,n1) * delw * I_eps_array(ig,my_igp)
                  endif

                enddo ! loop over g

              endif

              sch_array(iw) = sch_array(iw) + 0.5D0*scht

            enddo

          endif

! If a valence band, then accumulate SX contribution.

          if (flag_occ) then
            do iw=nstart,nend
              asxtemp(iw) = asxtemp(iw) - ssx_array(iw) * occ * vcoul(igp)
            enddo
          endif

! Accumulate CH contribution.

          do iw=nstart,nend
            achtemp(iw) = achtemp(iw) + sch_array(iw) * vcoul(igp)
          enddo

! Logging CH convergence.

          acht_n1_loc(n1true) = acht_n1_loc(n1true) + sch_array(2) * vcoul(igp)

        enddo ! igp
!$OMP END DO
        SAFE_DEALLOCATE(ssx_array)
        SAFE_DEALLOCATE(sch_array)
!$OMP END PARALLEL
        call timacc(52,2)
      enddo ! over ipe bands (n1)
    enddo ! over ipe

    do iw=nstart,nend
      asxt(iw) = asxt(iw) + asxtemp(iw)
      acht(iw) = acht(iw) + achtemp(iw)
    enddo

    if (sig%exact_ch .eq. 1) then
      achtcor = achtcor + 0.5d0 * (achxtemp - achstemp)
    endif

    SAFE_DEALLOCATE(aqsntemp)
    SAFE_DEALLOCATE(aqsmtemp)
    SAFE_DEALLOCATE(I_eps_array)
    SAFE_DEALLOCATE(wtilde_array)

    if (sig%ggpsum.eq.1) then
      ncouls2 = ncouls**2/2
    else
      ncouls2 = ncouls**2
    endif

! JRD group comm

#ifdef MPI
    if (peinf%my_pool .ne. -1) then
      call MPI_REDUCE(cycle_count,cycle_count_total,1,MPI_INTEGER8,MPI_SUM, 0,peinf%pool_comm,mpierr)
    endif
#endif

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

  else if (sig%freq_dep .eq. 2) then

    asxDtemp(:) = (0.0d0,0.0d0)
    achDtemp(:) = (0.0d0,0.0d0)
    achDtemp_cor(:) = (0.0d0,0.0d0)
    achDtemp_corb(:) = (0.0d0,0.0d0)
    ach2Dtemp(:) = (0.0d0,0.0d0)
    if (sig%exact_ch.eq.1) achsDtemp = (0.0d0, 0.0d0)

    SAFE_ALLOCATE(aqsntemp,(ncouls,peinf%ntband_max))
    SAFE_ALLOCATE(aqsmtemp,(ncouls,peinf%ntband_max))

    do ipe = 1, peinf%npes_pool

      call timacc(43,1)

      if (peinf%pool_rank .eq. ipe-1) then
        aqsntemp(:,:) = aqsn(1:ncouls,:)
        aqsmtemp(:,:) = aqsm(1:ncouls,:)
      endif

! JRD BCAST in just pool

#ifdef MPI
      if (peinf%my_pool .ne. -1) then
        call MPI_Bcast(aqsntemp,peinf%ntband_max*ncouls,MPI_SCALAR,ipe-1,peinf%pool_comm,mpierr)
        call MPI_Bcast(aqsmtemp,peinf%ntband_max*ncouls,MPI_SCALAR,ipe-1,peinf%pool_comm,mpierr)
      endif
#endif

      call timacc(43,2)

      do n1=1,peinf%ntband_dist(ipe)

! n1true = "True" band index of the band n1 w.r.t. all bands

        call timacc(51,1)

        n1true = peinf%indext_dist(n1,ipe)

! energy of the |n1,k-q> state

        e_n1kq = wfnkq%ekq(n1true,ispin)

! occupation of the |n1,k-q> state

        flag_occ = (n1true.le.(sig%nvband+sig%ncrit)) &
          .and.((sig%ncrit.eq.0).or.(e_n1kq.le.(sig%efermi+sig%tol)))
          
        tempval=abs(e_n1kq-sig%efermi)
        if (tempval .lt. sig%tol) then
          occ = 0.5d0 ! Fermi-Dirac distribution = 1/2 at Fermi level
        else
          occ = 1.0d0
        endif

! JRD: Find iw closest to e_lk

        diffmin = INF
        do iw=1,sig%nfreqeval
          diff = abs(sig%freqevalmin + (iw-1)*sig%freqevalstep - e_lk)
          if (diff .lt. diffmin) then
            diffmin=diff
            iwlda=iw
          endif
        enddo

        do iw=1,sig%nfreqeval
          wx = sig%freqevalmin - e_n1kq + (iw-1)*sig%freqevalstep
          wxi(iw) = wx
        enddo

! (gsm) compute the static CH for the static remainder

!$OMP PARALLEL do private (my_igp,igp,indigp,igmax,mygpvar1,ig)
        do my_igp = 1, ngpown
          indigp = inv_igp_index(my_igp)
          igp = indinv(indigp)

          if (igp .gt. ncouls .or. igp .le. 0) cycle

          igmax=ncouls

          mygpvar1 = MYCONJG(aqsmtemp(igp,n1))

          do ig = 1, igmax
            matngmatmgpD(ig,my_igp) = aqsntemp(ig,n1) * mygpvar1
          enddo
        enddo
!$OMP END PARALLEL DO

        if (sig%exact_ch.eq.1) then
!$OMP PARALLEL do private (my_igp,igp,indigp,igmax,ig, &
!$OMP                      schsDtemp ) reduction(+:achsDtemp)
          do my_igp = 1, ngpown
            indigp = inv_igp_index(my_igp)
            igp = indinv(indigp)

            if (igp .gt. ncouls .or. igp .le. 0) cycle

            igmax=ncouls

            schsDtemp = ZERO
            do ig = 1, igmax
              schsDtemp = schsDtemp-matngmatmgpD(ig,my_igp)*I_epsR_array(ig,my_igp,1)
              !schsDtemp = schsDtemp-matngmatmgpD(ig,my_igp)*I_epsR_array(1,ig,my_igp)
            enddo
            achsDtemp = achsDtemp + schsDtemp*vcoul(igp)*0.5D0
          enddo
!$OMP END PARALLEL DO
        endif

        ssxDi = (0D0,0D0)
        schDi = (0D0,0D0)
        schDi_cor = (0D0,0D0)
        schDi_corb = (0D0,0D0)
        sch2Di = (0D0,0D0)

        call timacc(51,2)

! JRD: Don`t want to thread here, nfreqeval could be small
        do iw=1,sig%nfreqeval
            
          wx = wxi(iw)

! SX and CH terms: equation (1.42) of Catalin`s thesis
! Note the negative sign in I_epsRggp and I_epsAggp

          if (flag_occ) then

            if(wx.ge.0.0d0) then
              ifreq=0
              do ijk = 1, sig%nFreq-1
                if (wx .ge. sig%dFreqGrid(ijk) .and. wx .lt. sig%dFreqGrid(ijk+1)) then
                  ifreq=ijk
                endif
              enddo
              if (ifreq .eq. 0) then
                ifreq = sig%nFreq+3 ! three is for luck
              endif
            else
              ifreq=0
              do ijk = 1, sig%nFreq-1
                if (-wx .ge. sig%dFreqGrid(ijk) .and. -wx .lt. sig%dFreqGrid(ijk+1)) then
                  ifreq=ijk
                endif
              enddo
              if (ifreq .eq. 0) then
                ifreq = sig%nFreq+3 ! three is for luck
              endif
            endif
              
            if(ifreq.ge.sig%nFreq) then
              write(0,777) iband,n1true,e_lk,e_n1kq,wx,E_max
              ifreq=sig%nFreq-1
            endif
777         format(1x,"WARNING: The real frequency range is too small." &
              ,/,3x,"l =",i3,1x,"n1 =",i5,1x,"E_l =",f8.3,1x,"E_n1" &
              ,1x,"=",f8.3,1x,"wx =",f8.3,1x,"E_max =",f8.3)
              
#ifdef CPLX
            if(wx.ge.0.d0) then

              call timacc(53,1)

              fact1 = (sig%dFreqGrid(ifreq+1)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
              fact2 = (wx-sig%dFreqGrid(ifreq))/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))

              ssxDittt = 0D0

!$OMP PARALLEL do private (my_igp,igp,indigp,igmax,ssxDitt,ig, &
!$OMP                       ssxDit) reduction(+:ssxDittt)
              do my_igp = 1, ngpown
                indigp = inv_igp_index(my_igp)
                igp = indinv(indigp)

                if (igp .gt. ncouls .or. igp .le. 0) cycle

                igmax=ncouls

                ssxDitt = (0D0,0D0)
                do ig = 1, igmax
                  ssxDit=I_epsR_array(ig,my_igp,ifreq)*fact1 + &
                  I_epsR_array(ig,my_igp,ifreq+1)*fact2 
                  !ssxDit=I_epsR_array(ifreq,ig,my_igp)*fact1 + &
                  !I_epsR_array(ifreq+1,ig,my_igp)*fact2 
 
                  ssxDitt = ssxDitt + matngmatmgpD(ig,my_igp)*ssxDit
                enddo
                ssxDittt = ssxDittt + ssxDitt*vcoul(igp)
              enddo
!$OMP END PARALLEL DO

              ssxDi(iw) = ssxDi(iw) + ssxDittt

              call timacc(53,2)

            else

              call timacc(53,1)

              fact1 = (sig%dFreqGrid(ifreq+1)+wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
              fact2 = (-sig%dFreqGrid(ifreq)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))

              ssxDittt = 0D0

!$OMP PARALLEL do private (my_igp,igp,indigp,igmax,ssxDitt,ig, &
!$OMP                       ssxDit) reduction(+:ssxDittt)
              do my_igp = 1, ngpown
                indigp = inv_igp_index(my_igp)
                igp = indinv(indigp)

                if (igp .gt. ncouls .or. igp .le. 0) cycle

                igmax=ncouls

                ssxDitt = (0D0,0D0)
                do ig = 1, igmax
                  ssxDit=I_epsA_array(ig,my_igp,ifreq)*fact1+ &
                    I_epsA_array(ig,my_igp,ifreq+1)*fact2
                  !ssxDit=I_epsA_array(ifreq,ig,my_igp)*fact1+ &
                  !  I_epsA_array(ifreq+1,ig,my_igp)*fact2

                  ssxDitt = ssxDitt + matngmatmgpD(ig,my_igp)*ssxDit
                enddo
                ssxDittt = ssxDittt + ssxDitt*vcoul(igp)
              enddo                
!$OMP END PARALLEL DO

              ssxDi(iw) = ssxDi(iw) + ssxDittt

              call timacc(53,2)

            endif
#else
            if(wx.ge.0.d0) then

              call timacc(53,1)

              fact1 = (sig%dFreqGrid(ifreq+1)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
              fact2 = (wx-sig%dFreqGrid(ifreq))/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))

              ssxDittt = 0D0

!$OMP PARALLEL do private (my_igp,igp,indigp,igmax,ssxDitt,ig, &
!$OMP                       ssxDit) reduction(+:ssxDittt)
              do my_igp = 1, ngpown
                indigp = inv_igp_index(my_igp)
                igp = indinv(indigp)

                if (igp .gt. ncouls .or. igp .le. 0) cycle

                igmax=ncouls

                ssxDitt = (0D0,0D0)
                do ig = 1, igmax
                  ssxDit=I_epsR_array(ig,my_igp,ifreq)*fact1+ &
                    I_epsR_array(ig,my_igp,ifreq+1)*fact2
                  !ssxDit=I_epsR_array(ifreq,ig,my_igp)*fact1+ &
                  !  I_epsR_array(ifreq+1,ig,my_igp)*fact2
                  ssxDitt = ssxDitt + matngmatmgpD(ig,my_igp)*ssxDit
                enddo
                ssxDittt = ssxDittt + ssxDitt * vcoul(igp)
              enddo
!$OMP END PARALLEL DO

              ssxDi(iw) = ssxDi(iw) + ssxDittt

              call timacc(53,2)

            else

              call timacc(53,1)

              fact1 = (sig%dFreqGrid(ifreq+1)+wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
              fact2 = (-sig%dFreqGrid(ifreq)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))

              ssxDittt = 0D0

!$OMP PARALLEL do private (my_igp,igp,indigp,igmax,ssxDitt,ig, &
!$OMP                       ssxDit) reduction(+:ssxDittt) 
              do my_igp = 1, ngpown
                indigp = inv_igp_index(my_igp)
                igp = indinv(indigp)

                if (igp .gt. ncouls .or. igp .le. 0) cycle

                igmax=ncouls

                ssxDitt = (0D0,0D0)
                do ig = 1, igmax
                  ssxDit=CONJG(I_epsR_array(ig,my_igp,ifreq))*fact1 + &
                    CONJG(I_epsR_array(ig,my_igp,ifreq+1))*fact2
                  !ssxDit=CONJG(I_epsR_array(ifreq,ig,my_igp))*fact1 + &
                  !  CONJG(I_epsR_array(ifreq+1,ig,my_igp))*fact2
                  ssxDitt = ssxDitt + matngmatmgpD(ig,my_igp)*ssxDit
                enddo
                ssxDittt = ssxDittt + ssxDitt * vcoul(igp)
              enddo
!$OMP END PARALLEL DO

              ssxDi(iw) = ssxDi(iw) + ssxDittt

              call timacc(53,2)
            endif
#endif
          endif
        enddo

! JRD: Now do CH term
          
        call timacc(49,1)

        SAFE_ALLOCATE(schDt_array,(sig%Nfreq))
        schDt_array(:) = 0D0
 
        call timacc(52,1)

!$OMP PARALLEL do private (my_igp,igp,indigp,igmax,ig,schDtt,I_epsRggp_int, &
!$OMP                      I_epsAggp_int,schD,schDt)
        do ifreq=1,sig%Nfreq

            schDt = (0D0,0D0)

            do my_igp = 1, ngpown
              indigp = inv_igp_index(my_igp)
              igp = indinv(indigp)

              if (igp .gt. ncouls .or. igp .le. 0) cycle

              igmax=ncouls

! JRD: The below loop is performance critical

              schDtt = (0D0,0D0)
              do ig = 1, igmax

                I_epsRggp_int = I_epsR_array(ig,my_igp,ifreq)
                !I_epsRggp_int = I_epsR_array(ifreq,ig,my_igp)

#ifdef CPLX
                I_epsAggp_int = I_epsA_array(ig,my_igp,ifreq)
                !I_epsAggp_int = I_epsA_array(ifreq,ig,my_igp)

                ! for G,G` components
                schD=I_epsRggp_int-I_epsAggp_int

                ! for G`,G components
                schDtt = schDtt + matngmatmgpD(ig,my_igp)*schD
#else
                schD= CMPLX(IMAG(I_epsRggp_int),0.0d0)
                schDtt = schDtt + matngmatmgpD(ig,my_igp)*schD
#endif
              enddo
              schDt = schDt + schDtt * vcoul(igp)
            enddo

! XXX: Threads could be stomping on each-other`s cache over this... try reduction?
            schdt_array(ifreq) = schDt

        enddo
!$OMP END PARALLEL DO

        call timacc(52,2)

!$OMP PARALLEL do private (ifreq,schDt,cedifft_zb,cedifft_coh,cedifft_cor, &
!$OMP                      cedifft_zb_right,cedifft_zb_left,schDt_right,schDt_left, &
!$OMP                      schDt_avg,schDt_lin,schDt_lin2,intfact,iw, &
!$OMP                      schDt_lin3) reduction(+:schDi,schDi_corb,schDi_cor,sch2Di) 
        do ifreq=1,sig%Nfreq

            schDt = schDt_array(ifreq)

            cedifft_zb = sig%dFreqGrid(ifreq)
            cedifft_coh = CMPLX(cedifft_zb,0D0)- sig%dFreqBrd(ifreq)

            if( flag_occ) then 
              cedifft_cor = -1.0d0*CMPLX(cedifft_zb,0D0) - sig%dFreqBrd(ifreq)
            else
              cedifft_cor = CMPLX(cedifft_zb,0D0) - sig%dFreqBrd(ifreq)
            endif

            if (ifreq .ne. 1) then 
              cedifft_zb_right = cedifft_zb
              cedifft_zb_left = sig%dFreqGrid(ifreq-1)
              schDt_right = schDt
              schDt_left = schDt_array(ifreq-1)
              schDt_avg = 0.5D0 * ( schDt_right + schDt_left )
              schDt_lin = schDt_right - schDt_left
              schDt_lin2 = schDt_lin/(cedifft_zb_right-cedifft_zb_left)
            endif

#ifdef CPLX
! The below two lines are for sigma1 and sigma3
            if (ifreq .ne. sig%Nfreq) then
              schDi(:) = schDi(:) - CMPLX(0.d0,pref(ifreq)) * schDt / ( wxi(:)-cedifft_coh)
              schDi_corb(:) = schDi_corb(:) - CMPLX(0.d0,pref(ifreq)) * schDt / ( wxi(:)-cedifft_cor)
            endif
            if (ifreq .ne. 1) then
              do iw = 1, sig%nfreqeval
!These lines are for sigma2
                intfact=abs((wxi(iw)-cedifft_zb_right)/(wxi(iw)-cedifft_zb_left))
                if (intfact .lt. 1d-4) intfact = 1d-4
                if (intfact .gt. 1d4) intfact = 1d4
                intfact = -log(intfact)
                sch2Di(iw) = sch2Di(iw) - CMPLX(0.d0,pref_zb) * schDt_avg * intfact
!These lines are for sigma4
                if (flag_occ) then
                  intfact=abs((wxi(iw)+cedifft_zb_right)/(wxi(iw)+cedifft_zb_left))
                  if (intfact .lt. 1d-4) intfact = 1d-4
                  if (intfact .gt. 1d4) intfact = 1d4
                  intfact = log(intfact)
                  schDt_lin3 = (schDt_left + schDt_lin2*(-wxi(iw)-cedifft_zb_left))*intfact
                else 
                  schDt_lin3 = (schDt_left + schDt_lin2*(wxi(iw)-cedifft_zb_left))*intfact
                endif
                schDt_lin3 = schDt_lin3 + schDt_lin
                schDi_cor(iw) = schDi_cor(iw) - CMPLX(0.d0,pref_zb) * schDt_lin3
              enddo
            endif
#else
! The below two lines are for sigma1 and sigma3
            if (ifreq .ne. sig%Nfreq) then
              schDi(:) = schDi(:) + pref(ifreq)*schDt / ( wxi(:)-cedifft_coh)
              schDi_corb(:) = schDi_corb(:) + pref(ifreq)*schDt / ( wxi(:)-cedifft_cor)
            endif
            if (ifreq .ne. 1) then
              do iw = 1, sig%nfreqeval
!These lines are for sigma2
                intfact=abs((wxi(iw)-cedifft_zb_right)/(wxi(iw)-cedifft_zb_left))
                if (intfact .lt. 1d-4) intfact = 1d-4
                if (intfact .gt. 1d4) intfact = 1d4
                intfact = -log(intfact)
                sch2Di(iw) = sch2Di(iw) + pref_zb * schDt_avg * intfact
!These lines are for sigma4
                if (flag_occ) then
                  intfact=abs((wxi(iw)+cedifft_zb_right)/(wxi(iw)+cedifft_zb_left))
                  if (intfact .lt. 1d-4) intfact = 1d-4
                  if (intfact .gt. 1d4) intfact = 1d4
                  intfact = log(intfact)
                  schDt_lin3 = (schDt_left + schDt_lin2*(-wxi(iw)-cedifft_zb_left))*intfact
                else
                  schDt_lin3 = (schDt_left + schDt_lin2*(wxi(iw)-cedifft_zb_left))*intfact
                endif
                schDt_lin3 = schDt_lin3 + schDt_lin
                schDi_cor(iw) = schDi_cor(iw) + pref_zb * schDt_lin3
              enddo
            endif
#endif
        enddo
!$OMP END PARALLEL DO

        SAFE_DEALLOCATE(schDt_array)

        call timacc(49,2)

! JRD: Compute Sigma2 and Sigma4 delta function contributions

        call timacc(50,1)

        do iw = 1, sig%nfreqeval
          wx = wxi(iw)
          if(wx .ge. 0.0d0) then
            ifreq=0
            do ijk = 1, sig%nFreq-1
              if (wx .ge. sig%dFreqGrid(ijk) .and. wx .lt. sig%dFreqGrid(ijk+1)) then
                ifreq=ijk
              endif
            enddo
            if (ifreq .eq. 0) then
              ifreq=sig%nFreq-1
            endif

            fact1=(sig%dFreqGrid(ifreq+1)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
            fact2=(wx-sig%dFreqGrid(ifreq))/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))

            schDttt = 0D0
            schDttt_cor = 0D0

!$OMP PARALLEL do private (my_igp,igp,indigp,igmax,ig, &
!$OMP                      sch2Dt,sch2Dtt) reduction(+:schDttt,schDttt_cor) 
            do my_igp = 1, ngpown
              indigp = inv_igp_index(my_igp)
              igp = indinv(indigp)

              if (igp .gt. ncouls .or. igp .le. 0) cycle

              igmax=ncouls

              sch2Dtt = (0D0,0D0)
              do ig = 1, igmax
#ifdef CPLX
                sch2Dt=-0.5D0*((I_epsR_array(ig,my_igp,ifreq)-I_epsA_array(ig,my_igp,ifreq))*fact1 + &
                       (I_epsR_array(ig,my_igp,ifreq+1)-I_epsA_array(ig,my_igp,ifreq+1))*fact2)
                !sch2Dt=-0.5D0*((I_epsR_array(ifreq,ig,my_igp)-I_epsA_array(ifreq,ig,my_igp))*fact1 + &
                !       (I_epsR_array(ifreq+1,ig,my_igp)-I_epsA_array(ifreq+1,ig,my_igp))*fact2)
#else
                sch2Dt = CMPLX(0D0,-1D0)* &
                  (IMAG(I_epsR_array(ig,my_igp,ifreq))*fact1 + IMAG(I_epsR_array(ig,my_igp,ifreq+1))*fact2)
                !sch2Dt = CMPLX(0D0,-1D0)* &
                !  (IMAG(I_epsR_array(ifreq,ig,my_igp))*fact1 + IMAG(I_epsR_array(ifreq+1,ig,my_igp))*fact2)
#endif
                sch2Dtt = sch2Dtt + matngmatmgpD(ig,my_igp)*sch2Dt
              enddo
              schDttt = schDttt + sch2Dtt*vcoul(igp)
              if (flag_occ) then
              else
                schDttt_cor = schDttt_cor + sch2Dtt*vcoul(igp)
              endif
            enddo
!$OMP END PARALLEL DO

            sch2Di(iw) = sch2Di(iw) + schDttt
            schDi_cor(iw) = schDi_cor(iw) + schDttt_cor
          else if (flag_occ) then
            wx=-wx
            ifreq=0
            do ijk = 1, sig%nFreq-1
              if (wx .ge. sig%dFreqGrid(ijk) .and. wx .lt. sig%dFreqGrid(ijk+1)) then
                ifreq=ijk
              endif
            enddo
            if (ifreq .eq. 0) then
              ifreq=sig%nFreq-1
            endif

            fact1=(sig%dFreqGrid(ifreq+1)-wx)/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))
            fact2=(wx-sig%dFreqGrid(ifreq))/(sig%dFreqGrid(ifreq+1)-sig%dFreqGrid(ifreq))

            schDttt_cor = 0D0

!$OMP PARALLEL do private (my_igp,igp,indigp,igmax,ig, &
!$OMP                      sch2Dt,sch2Dtt) reduction(+:schDttt_cor) 
            do my_igp = 1, ngpown
              indigp = inv_igp_index(my_igp)
              igp = indinv(indigp)

              if (igp .gt. ncouls .or. igp .le. 0) cycle

              igmax=ncouls

              sch2Dtt = (0D0,0D0)
              do ig = 1, igmax
#ifdef CPLX
                sch2Dt=-0.5D0*((I_epsR_array(ig,my_igp,ifreq)-I_epsA_array(ig,my_igp,ifreq))*fact1 + &
                       (I_epsR_array(ig,my_igp,ifreq+1)-I_epsA_array(ig,my_igp,ifreq+1))*fact2)
                !sch2Dt=-0.5D0*((I_epsR_array(ifreq,ig,my_igp)-I_epsA_array(ifreq,ig,my_igp))*fact1 + &
                !       (I_epsR_array(ifreq+1,ig,my_igp)-I_epsA_array(ifreq+1,ig,my_igp))*fact2)
#else
                sch2Dt = CMPLX(0D0,-1D0)* &
                  (IMAG(I_epsR_array(ig,my_igp,ifreq))*fact1 + IMAG(I_epsR_array(ig,my_igp,ifreq+1))*fact2)
                !sch2Dt = CMPLX(0D0,-1D0)* &
                !  (IMAG(I_epsR_array(ifreq,ig,my_igp))*fact1 + IMAG(I_epsR_array(ifreq+1,ig,my_igp))*fact2)
#endif
                sch2Dtt = sch2Dtt + matngmatmgpD(ig,my_igp)*sch2Dt
              enddo
              schDttt_cor = schDttt_cor + sch2Dtt*vcoul(igp)
            enddo
!$OMP END PARALLEL DO
            schDi_cor(iw) = schDi_cor(iw) + schDttt_cor
          endif
        enddo

        call timacc(50,2)

        do iw = 1, sig%nfreqeval
            
          if (flag_occ) then
            asxDtemp(iw) = asxDtemp(iw) + ssxDi(iw)*occ
          endif
            
          achDtemp(iw) = achDtemp(iw) + schDi(iw)
          achDtemp_cor(iw) = achDtemp_cor(iw) + schDi_cor(iw)
          achDtemp_corb(iw) = achDtemp_corb(iw) + schDi_corb(iw)
          ach2Dtemp(iw) = ach2Dtemp(iw) + sch2Di(iw)
  
! JRD: This is now close to LDA

          if (iw.eq.iwlda) achtD_n1(n1true) = &
            achtD_n1(n1true) + schDi(iw)

        enddo ! over iw
          
      enddo ! over bands (n1)
    enddo ! over bands (ipe)

    do iw=1,sig%nfreqeval
      asxtDyn(iw) = asxtDyn(iw) + asxDtemp(iw)
      achtDyn(iw) = achtDyn(iw) + achDtemp(iw)
      achtDyn_cor(iw) = achtDyn_cor(iw) + achDtemp_cor(iw)
      achtDyn_corb(iw) = achtDyn_corb(iw) + achDtemp_corb(iw)
      ach2tDyn(iw) = ach2tDyn(iw) + ach2Dtemp(iw)
    enddo

!------------------------
! (gsm) compute the static remainder
!       a factor of 0.5 accounts for the fact that GPP/FF CH converges
!       with the number of empty bands two times faster than COHSEX CH

    if (sig%exact_ch .eq. 1) then
      achtcor = achtcor + 0.5d0 * (achxtemp - SCALARIFY(achsDtemp))
    endif

    SAFE_DEALLOCATE(I_epsR_array)
#ifdef CPLX
    SAFE_DEALLOCATE(I_epsA_array)
#endif
    SAFE_DEALLOCATE(matngmatmgpD)
    SAFE_DEALLOCATE(matngpmatmgD)

    SAFE_DEALLOCATE(aqsntemp)
    SAFE_DEALLOCATE(aqsmtemp)

  endif ! Freq Dependence

!-----------------------
! Deallocate and Finish

  if (sig%freq_dep.eq.1 .or.sig%freq_dep.eq.3) then
    SAFE_DEALLOCATE(wx_array)
    if (sig%fdf.eq.-3) then
      SAFE_DEALLOCATE(wxi)
    endif
  endif 

  if (sig%freq_dep.eq.1 .or. sig%freq_dep.eq.0 .or. sig%freq_dep.eq.3)  then
    SAFE_DEALLOCATE(asxtemp)
    SAFE_DEALLOCATE(achtemp)
    acht_n1(1:sig%ntband) = acht_n1_loc(1:sig%ntband)
  endif

  if (sig%freq_dep.eq.2) then
    SAFE_DEALLOCATE(pref)
    SAFE_DEALLOCATE(asxDtemp)
    SAFE_DEALLOCATE(achDtemp)
    SAFE_DEALLOCATE(achDtemp_cor)
    SAFE_DEALLOCATE(achDtemp_corb)
    SAFE_DEALLOCATE(ach2Dtemp)
    SAFE_DEALLOCATE(schDi)
    SAFE_DEALLOCATE(schDi_cor)
    SAFE_DEALLOCATE(schDi_corb)
    SAFE_DEALLOCATE(sch2Di)
    SAFE_DEALLOCATE(ssxDi)
    SAFE_DEALLOCATE(wxi)
  endif

  POP_SUB(mtxel_sxch)
  
  return
end subroutine mtxel_sxch

end module mtxel_sxch_m
