!===============================================================================
!
! Routines:
!
! (1) sigma              Originally By MSH        Last Modified 10/5/2009 (gsm)
!
! This is the main routine for the Sigma code.  Please see the documentation
! in the README for more information on this code.
!
!===============================================================================

#include "f_defs.h"

program sigma

  use global_m
  use checkbz_m
  use checkgriduniformity_m
  use check_screening_m
  use fixwings_m
  use fftw_m
  use fullbz_m
  use gmap_m
  use input_utils_m
  use irrbz_m
  use misc_m
  use mtxel_sxch_m
  use sort_m
  use vcoul_generator_m
  use wfn_rho_vxc_io_m
  use write_result_m
  use write_result_dyn_m
  use write_result_dyn_hp_m
  use write_result_hp_m
  use io_utils_m
#ifdef HDF5
  use hdf5
#endif
  use epsread_hdf5_m
  use epscopy_m
  use io_utils_m
  implicit none

!---------------------
! Derived Types

  type (crystal) :: crys
  type (symmetry) :: syms
  type (gspace) :: gvec
  type (kpoints) :: kp
  type (siginfo) :: sig
  type (wpgen) :: wpg
  type (wfnkstates) :: wfnk,wfnkoff
  type (wfnkqstates) :: wfnkq
  type (epsmpiinfo) :: epsmpi
  type (wfnkqmpiinfo) :: wfnkqmpi
  type (wfnkmpiinfo) :: wfnkmpi
  type (twork_scell) :: work_scell
  type(progress_info) :: prog_info !< a user-friendly progress report

!---------------------
! k-points for the sum over BZ (the rq-points)

  integer :: nm,nrq,ijk,iout,iparallel
  integer, allocatable :: neq(:),indrq(:),itnrq(:),kg0(:,:)
  real(DP), allocatable :: rq(:,:)
  type(grid) :: gr
  
!---------------------
! (k-q) kpoints involved in sigma summations
! ===> see data type cwfnkq and vwfnkq

  real(DP) :: rkq(3)

!---------------------
! Dielectric matrices

  integer :: ngq,neps,nmtx,ncoul,ncoulch,ncoulb,ncouls
  integer :: ngqt,nmtxt,nfreqgpp
  integer, allocatable :: isrtq(:),isrtqi(:)
  integer, allocatable :: isrtrq(:)
  integer, pointer :: isrtrqi(:)
  SCALAR, pointer :: eps(:,:)
  real(DP), allocatable :: vcoul(:), ekin(:)
  complex(DPC), pointer :: epsR(:,:,:),epsA(:,:,:)

!---------------------
! Matrix elements for sigma

  SCALAR, allocatable :: aqs(:,:),aqsaug(:,:,:,:), &
    alda(:,:),alda2(:,:),ax(:,:),axbis(:,:), &
    asx(:,:,:),ach(:,:,:),asig(:,:), &
    ach_n1q(:,:,:),ach_n1(:,:,:),asxbis(:,:,:),achbis(:,:,:), &
    xdum1(:,:),xdum2(:,:),xdum3(:,:,:)
  SCALAR, pointer :: aqsch(:), aqsaugchd(:,:,:), aqsaugcho(:,:,:), acht_n1(:)
  real(DP), allocatable :: enew(:,:),efsto(:,:),zrenorm(:,:)

  complex(DPC), allocatable :: achcor(:,:), achcorbis(:,:)
  complex(DPC), allocatable :: asig_imag(:,:), asig_imagbis(:,:)

  complex(DPC), allocatable :: asxDyn(:,:,:),achDyn(:,:,:),achDyn_cor(:,:,:), &
    achDyn_corb(:,:,:),ach2Dyn(:,:,:),asigDyn(:,:),achD_n1q(:,:,:), &
    achD_n1(:,:,:),asxDbis(:,:,:),achDbis(:,:,:),achDbis_cor(:,:,:),ach2Dbis(:,:,:), &
    achDbis_corb(:,:,:),xdum1Dyn(:,:),xdum2Dyn(:,:),xdum3Dyn(:,:,:)
  complex (DPC), pointer :: achtD_n1(:)
  complex(DPC), allocatable :: efstoDyn(:,:)

!----------------------
! eps distrib variables

  integer :: ngpown
  integer, allocatable :: igp_owner(:),igp_index(:),inv_igp_index(:)
  SCALAR, allocatable :: epstemp(:)
  
  complex(DPC), allocatable :: epsRtemp(:,:),epsAtemp(:,:)

  character :: tmpstr*120
  character :: tmpfn*16
  character*20 :: fnc,fnk,fne
  character*16 :: routnam(100)
  integer :: routsrt(42),nullvec(3)
  logical :: xflag,eqp1flag,imagvxcflag,imagxflag,found,q0flag,bExactlyZero
  integer :: ig,i,j,k,itran,ikn,ika,ioff,error
  integer :: in,im,iw,ib,idum,kg(3),icurr,jj,ii,ispin,g1,g2
  integer :: ncount,ndum,nbandi,tag,dest,source,nfold,ifold
  integer :: iwlda,irq,irq_,irq_min,n1,ierr
  integer :: s2,iunit_c,iunit_k,iunit_eps,ndv_ikn,ngpown_max,ngpown_rem
  integer, allocatable :: ind(:), indinv(:)
  real(DP) :: fact,coulfact,weight,tempval,occ
  real(DP) :: qshift(3),oneoverq,qlen,q0len,vq(3),qk(3)
  real(DP) :: tsec(2),diffmin,diff,e_lk,avgcut,subcut
  complex(DPC), pointer :: achtDyn(:),achtDyn_cor(:),asxtDyn(:),ach2tDyn(:),achtDyn_corb(:)
  SCALAR :: achtcor,axt,epshead,asigt_imag
  SCALAR, pointer :: asxt(:), acht(:)
  SCALAR, allocatable :: ph(:)

  logical :: skip_checkbz, is_subq

!--------------- Begin Program -------------------------------------------------

  call peinfo_init()

!----------------------
! Initialize random numbers

  peinf%jobtypeeval = 1

!------------------------
! Initialize timer

  if (peinf%inode.eq.0) then
    call timacc(0,0)
    call timacc(1,1)
  endif

!------------------------
! Initialize files

  call open_file(55,file='sigma.inp',form='formatted',status='old')
  if(peinf%inode == 0) then
    call open_file(7,file='sigma.log',form='formatted',status='replace')
    call open_file(8,file='sigma_hp.log',form='formatted',status='replace')
  endif

  call write_program_header('Sigma', .true.)

!------------------------
! Initialize temporary files

!  if (sig%iwriteint .eq. 0) then
! We do not yet know, until inread inside input, whether we will use comm_disk
! However, we need to pass fnc and fnk to input. Therefore, we initialize these
! variables in all cases, though we will never use them for comm_mpi. --DAS
  iunit_c=10000+peinf%inode
  iunit_k=20000+peinf%inode
  iunit_eps=30000

  if(peinf%inode.lt.1000000) then
    write(fnc,'(a,i6.6)') 'INT_CWFN_', peinf%inode
    write(fnk,'(a,i6.6)') 'INT_WFNK_', peinf%inode
    write(fne,'(a)') 'INT_EPS'
  else
    call die('temp file names: too many nodes')
  endif

!------- Read crys data and wavefunctions from WFN_inner ----------------------------

! JRD: Included in input is the inread routine which reads the
! job parameters from sigma.inp and initializes the XC potential

  if (peinf%inode.eq.0) call timacc(2,1)
  call input(crys,gvec,syms,kp,wpg,sig,wfnk,iunit_c,iunit_k,fnc,fnk,wfnkqmpi,wfnkmpi)
  if (peinf%inode.eq.0) call timacc(2,2)
  if (peinf%inode.eq.0) call timacc(15,1)
  call input_outer(crys,gvec,syms,kp,sig,wfnk,iunit_k,fnk,wfnkmpi)
  if (peinf%inode.eq.0) call timacc(15,2)
  SAFE_DEALLOCATE_P(sig%band_index)
  SAFE_DEALLOCATE_P(sig%kpt)
  SAFE_DEALLOCATE_P(kp%ifmin)
  SAFE_DEALLOCATE_P(kp%ifmax)

!-------------------
! Initialize Various Parameters from inread

! linear extrapolation for eqp1

  eqp1flag=.false.

! imaginary parts of diagonal vxc or exchange matrix elements

  imagvxcflag=.false.
  imagxflag=.false.

! fraction of bare exchange

  if (abs(sig%xfrac).GT.TOL_Small) then
    xflag=.true.
  else
    xflag=.false.
  endif

!---------------------
! Open Various Files 

  if (peinf%inode .eq. 0) then
    if (.not.(sig%freq_dep .eq. 0 .and. sig%exact_ch .eq. 1) .and. .not. (sig%freq_dep == -1)) then
      call open_file(127,file='ch_converge.dat',form='formatted',status='replace')
    endif

    if (sig%iwritecoul .eq. 1) then
      call open_file(19,file='vcoul',form='formatted',status='replace')
    endif

   ! This if for the hybrid functional calculations (one shot) otherwise just open x.dat
    if (sig%coul_mod_flag .and. (.not. sig%use_vxc2dat)) then
      call open_file(121,file='vxc2.dat',form='formatted',status='replace')
    else if ((.not. sig%use_xdat) .and. xflag .and. (.not. sig%coul_mod_flag)) then
      call open_file(119,file='x.dat',form='formatted',status='replace')
    endif

    if (.not. sig%use_vxcdat) then
      call open_file(120,file='vxc.dat',form='formatted',status='replace')
    endif
  endif

!---------------------
! Write header of sigma_hp.log file

  if (peinf%inode.eq.0) then
    write(8,601) sig%freq_dep
    write(8,602) sig%bmin,sig%bmax
    write(8,603) sig%loff,sig%toff
    write(8,604) sig%fdf
    if(sig%fdf /= -2) write(8,605) sig%dw
    write(8,606) syms%ntran
    do itran=1,syms%ntran
      write(8,607) itran, ((syms%mtrx(i,j,itran),i=1,3),j=1,3)
    enddo
    write(8,*)
  endif
601 format(/,1x,"frequency_dependence",i4)
602 format(/,1x,"band_index",2i6)
603 format(1x,"sigma_matrix",i6,i4)
604 format(/,1x,"finite_difference_form",i4)
605 format(1x,"finite_difference_spacing",f10.6)
606 format(/,1x,"symmetries",/,1x,"ntran  =",i3)
607 format(1x,"mtrx",i2.2,1x,"=",9i3)

!---------------------
! JRD: Initialize the Full Frequency output files

  if (peinf%inode.eq.0 .and. (sig%freq_dep.eq.2 .or. (sig%fdf.eq.-3 .and. sig%freq_dep.eq.1))) then
    call open_file(8000,file='spectrum.dat',form='formatted',status='replace')
  endif

!---------------------
! Determine nq and neps
  
! JRD: This performs significantly better with hdf5

  if (peinf%inode.eq.0) call timacc(59,1)

  call epscopy_init(sig, neps)
  if (sig%freq_dep/=-1) then
    ! FHJ: sig%nq and sig%qpt already defined if this is a HF calculation
    sig%nq = sig%nq0 + sig%nq1 
    SAFE_ALLOCATE(sig%qpt, (3,sig%nq))
  endif
  if (sig%nq0==0) call die('There is no q->0 point in your calculation!', only_root_writes=.true.)

!----------------------------
! Allocate arrays

  SAFE_ALLOCATE(wfnkq%isrtkq, (gvec%ng))
  SAFE_ALLOCATE(wfnkq%ekq, (sig%ntband,kp%nspin))
  SAFE_ALLOCATE(alda, (sig%ndiag+sig%noffdiag,sig%nspin))
  SAFE_ALLOCATE(ax, (sig%ndiag+sig%noffdiag,sig%nspin))
  SAFE_ALLOCATE(axbis, (sig%ndiag+sig%noffdiag,sig%nspin))
  SAFE_ALLOCATE(achcor, (sig%ndiag+sig%noffdiag,sig%nspin))
  SAFE_ALLOCATE(asig_imag, (sig%ndiag+sig%noffdiag,sig%nspin))
  SAFE_ALLOCATE(achcorbis, (sig%ndiag+sig%noffdiag,sig%nspin))
  SAFE_ALLOCATE(asig_imagbis, (sig%ndiag+sig%noffdiag,sig%nspin))
  if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
    if (sig%fdf .eq. -3) then
      SAFE_ALLOCATE(asx, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
      SAFE_ALLOCATE(ach, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
      SAFE_ALLOCATE(asxbis, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
      SAFE_ALLOCATE(achbis, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    else
      SAFE_ALLOCATE(asx, (3,sig%ndiag+sig%noffdiag,sig%nspin))
      SAFE_ALLOCATE(ach, (3,sig%ndiag+sig%noffdiag,sig%nspin))
      SAFE_ALLOCATE(asxbis, (3,sig%ndiag+sig%noffdiag,sig%nspin))
      SAFE_ALLOCATE(achbis, (3,sig%ndiag+sig%noffdiag,sig%nspin))
    endif
    SAFE_ALLOCATE(asig, (sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(acht_n1, (sig%ntband))
    SAFE_ALLOCATE(ach_n1q, (sig%ntband,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(ach_n1, (sig%ntband,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(enew, (sig%ndiag,sig%nspin))
    SAFE_ALLOCATE(efsto, (sig%ndiag,sig%nspin))
    SAFE_ALLOCATE(zrenorm, (sig%ndiag,sig%nspin))
    if (sig%fdf.eq.-3) then
      nfreqgpp=sig%nfreqeval
      SAFE_ALLOCATE(asxt, (sig%nfreqeval))
      SAFE_ALLOCATE(acht, (sig%nfreqeval))
    else
      nfreqgpp=3
      SAFE_ALLOCATE(asxt, (3))
      SAFE_ALLOCATE(acht, (3))
    endif
  endif
  if (sig%freq_dep.eq.2) then
    SAFE_ALLOCATE(asxDyn, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achDyn, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achDyn_cor, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achDyn_corb, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(ach2Dyn, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(asigDyn, (sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achtD_n1, (sig%ntband))
    SAFE_ALLOCATE(achD_n1q, (sig%ntband,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achD_n1, (sig%ntband,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(asxDbis, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achDbis, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achDbis_cor, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achDbis_corb, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(ach2Dbis, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(efstoDyn, (sig%ndiag,sig%nspin))
    SAFE_ALLOCATE(asxtDyn, (sig%nfreqeval))
    SAFE_ALLOCATE(achtDyn, (sig%nfreqeval))
    SAFE_ALLOCATE(achtDyn_cor, (sig%nfreqeval))
    SAFE_ALLOCATE(achtDyn_corb, (sig%nfreqeval))
    SAFE_ALLOCATE(ach2tDyn, (sig%nfreqeval))
  endif
  SAFE_ALLOCATE(isrtrq, (gvec%ng))
  if (sig%freq_dep.eq.0.or.sig%exact_ch.eq.1) then
    SAFE_ALLOCATE(isrtrqi, (gvec%ng))
  endif
  SAFE_ALLOCATE(ekin, (gvec%ng))
  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
    SAFE_ALLOCATE(isrtq, (gvec%ng))
    SAFE_ALLOCATE(isrtqi, (gvec%ng))
    SAFE_ALLOCATE(ind, (gvec%ng))
    SAFE_ALLOCATE(indinv, (gvec%ng))
    SAFE_ALLOCATE(ph, (gvec%ng))
    if(sig%iwriteint.eq.1) then
      if (peinf%inode .eq. 0) then
        SAFE_ALLOCATE(epsmpi%isrtq, (gvec%ng,sig%nq))
        SAFE_ALLOCATE(epsmpi%isrtqi, (gvec%ng,sig%nq))
      endif
      SAFE_ALLOCATE(epsmpi%qk, (3,sig%nq))
      SAFE_ALLOCATE(epsmpi%nmtx, (sig%nq))
      SAFE_ALLOCATE(epsmpi%igp_owner, (neps))
      SAFE_ALLOCATE(epsmpi%igp_index, (neps))
      epsmpi%ngpown_max = neps/peinf%npes_pool
      epsmpi%ngpown_rem = mod(neps,peinf%npes_pool)
      if (mod(neps,peinf%npes_pool) .ne. 0) epsmpi%ngpown_max = epsmpi%ngpown_max + 1
      SAFE_ALLOCATE(epsmpi%inv_igp_index, (epsmpi%ngpown_max))
    endif
  endif
  ngpown_max = neps/peinf%npes_pool
  if (mod(neps,peinf%npes_pool) .ne. 0) ngpown_max = ngpown_max + 1
  ngpown_rem = mod(neps,peinf%npes_pool)

  if (peinf%inode.eq.0) call timacc(59,2)
 
!----------------------------
! Read eps^-1 from eps0mat/epsmat
!
! JRD: The matrices are read in from eps0mat/epsmat files and writen
! to temporary INT_EPS files on unit iunit_eps. The q->0 matrix is not
! symmetrized. The wavevector q is also read from subroutine epscopy.

  if (peinf%inode.eq.0) call timacc(3,1)
  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
    call logit('Calling epscopy')
    if (sig%iwriteint.eq.0) then
      
      call epscopy(crys,gvec,sig,neps,epsmpi,epshead,iunit_eps,fne)
      
    else if (sig%iwriteint.eq.1) then
      
! JRD: Distribute epsilon over procs in pool

      icurr=0
      epsmpi%igp_owner=-1
      epsmpi%igp_index=0
      
      do ijk = 1,neps
! JRD: I thought we could improve performance by having each own blocks for contiguous igp
! This turns out to be a bad idea because it leads poor load balancing when ggpsum=1 
! (half matrix)
        epsmpi%igp_owner(ijk)=mod(ijk-1,peinf%npes_pool) 
        !if (ijk .le. epsmpi%ngpown_max * epsmpi%ngpown_rem) then
        !  epsmpi%igp_owner(ijk) = (ijk-1) / epsmpi%ngpown_max
        !else
        !  epsmpi%igp_owner(ijk) = ((ijk - 1 - epsmpi%ngpown_max * epsmpi%ngpown_rem) / &
        !      (epsmpi%ngpown_max - 1)) + epsmpi%ngpown_rem
        !endif
        if (peinf%pool_rank .eq. epsmpi%igp_owner(ijk)) then
          icurr=icurr+1
          epsmpi%igp_index(ijk)=icurr
          epsmpi%inv_igp_index(icurr)=ijk
        endif
      enddo
      
      epsmpi%ngpown=icurr
      if (epsmpi%ngpown .gt. epsmpi%ngpown_max) call die('Inconsistency in ngpown/ngpownmax')

      call epscopy(crys,gvec,sig,neps,epsmpi,epshead,iunit_eps,fne)

    endif  ! if (sig%iwriteint.eq.1)
  endif  ! if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2)
  if (peinf%inode.eq.0) call timacc(3,2)

!----------------------------
! Generate full Brillouin zone from irreducible wedge q -> gr%f

#ifdef VERBOSE
  if (peinf%inode.eq.0) then
    write(6,*)'*** VERBOSE: nq, syms%ntran= ',sig%nq,syms%ntran
  endif
#endif
  
  if (peinf%inode.eq.0) call timacc(4,1)
  gr%nr = sig%nq
  SAFE_ALLOCATE(gr%r, (3, sig%nq))
  gr%r(1:3,1:sig%nq) = sig%qpt(1:3,1:sig%nq)
  call fullbz(crys,syms,gr,syms%ntran,skip_checkbz,wigner_seitz=.false.,paranoid=.true.,nfix=sig%nq0)
  qshift(:)=0.0d0
  if (sig%freq_dep.eq.-1) then
    ! for Hartree-Fock, there is no epsmat/eps0mat file
    tmpfn="sigma.inp"
  else
    if (sig%igamma.ne.0) then
      tmpfn='eps0mat'
    else
      tmpfn='epsmat'
    endif
  endif
  if (.not. skip_checkbz) then
    !FHJ: TODO: ignore change checkbz to support nq0
    !call checkbz(gr%nf,gr%f,sig%qgrid,qshift,crys%bdot,tmpfn,'q',.false.,sig%freplacebz,sig%fwritebz,nfix=sig%nq0)
    call checkbz(gr%nf,gr%f,sig%qgrid,qshift,crys%bdot,tmpfn,'q',.false.,sig%freplacebz,sig%fwritebz)
  endif
  if (peinf%inode.eq.0) call timacc(4,2)
  

!-------- Start computation of sigma operator ----------------------------------

  
  fact = 1D0/(dble(gr%nf-sig%nq0+1)*crys%celvol)
  coulfact = 8D0*PI_D/(dble(gr%nf-sig%nq0+1)*crys%celvol)

!----------------------------
! Initialize distribution of epsilon

  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2.or.sig%freq_dep==3) then
    
#ifdef VERBOSE
    call logit('Allocating eps')
    if (peinf%inode.eq.0) then
      write(6,*) '*** VERBOSE: size of eps, neps=',neps
    endif
#endif

! JRD: DISTRIBUTE TO PROCS IN POOLS
! XXX: We do identical thing above in epsmpi
!    : except inv_igp_index allocated of different sizes

    SAFE_ALLOCATE(igp_owner, (neps))
    SAFE_ALLOCATE(igp_index, (neps))
    SAFE_ALLOCATE(inv_igp_index, (neps))
    
    icurr=0
    igp_owner=-1
    igp_index=0
    inv_igp_index=0
    
    do ijk = 1,neps
! JRD: I thought we could improve performance by having each own blocks for contiguous igp
! This turns out to be a bad idea because it leads poor load balancing when ggpsum=1 
! (half matrix)
      igp_owner(ijk)=mod(ijk-1,peinf%npes_pool)
      !if (ijk .le. ngpown_max * ngpown_rem) then
      !  igp_owner(ijk) = (ijk-1) / ngpown_max
      !else
      !  igp_owner(ijk) = ((ijk - 1 - ngpown_max * ngpown_rem) / &
      !      (ngpown_max - 1)) + ngpown_rem
      !endif

      if (peinf%pool_rank .eq. igp_owner(ijk)) then
        icurr=icurr+1
        igp_index(ijk)=icurr
        inv_igp_index(icurr)=ijk
      endif
    enddo

    ngpown=icurr

  endif

  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
    SAFE_ALLOCATE(eps, (neps,ngpown))
    SAFE_ALLOCATE(epstemp, (neps))
  endif
  if (sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
    SAFE_ALLOCATE(epsR, (sig%nFreq,neps,ngpown))
    SAFE_ALLOCATE(epsRtemp, (sig%nFreq,neps))
#ifdef CPLX
    SAFE_ALLOCATE(epsA, (sig%nFreq,neps,ngpown))
    SAFE_ALLOCATE(epsAtemp, (sig%nFreq,neps))
#endif
  endif


!---------- Check grid for uniformity

  if(peinf%inode == 0) call checkgriduniformity(sig%qgrid, crys, sig%icutv)

!-------- Loop over kpoints rkn-------------------------------------------------


  do ika=1,sig%nkn
!----------------------------
! Read wavefunctions for rkn (if sig%nkn.gt.1)
! (else it is already in wfnk)
    ikn = sig%indkn(ika)
    if (peinf%inode.eq.0) call timacc(19,1)
    
    if(sig%nkn.gt.1) then
      if (sig%iwriteint .eq. 0) then
        if (mod(peinf%inode,peinf%npes/peinf%npools).eq.0) then
          call open_file(iunit_k,file=fnk,form='unformatted',status='old')
          
          found = .false.
          do im=1,sig%nkn
            read(iunit_k) wfnk%nkpt,wfnk%ndv,nbandi,(wfnk%isrtk(j),j=1,gvec%ng),(qk(i),i=1,3)
            
! Check k-point
            if(all(abs(kp%rk(1:3,ikn)-qk(1:3)) .lt. TOL_Small)) then
              found = .true.
              in=im
              ndv_ikn=wfnk%ndv
            endif
          enddo

          if(.not. found) then
            call die('sigma main: k-point mismatch')
          endif
          
          ! why are we reading this file a second time?? --DAS
          SAFE_ALLOCATE(wfnk%zk, (ndv_ikn,sig%nspin*kp%nspinor))
          call close_file(iunit_k)
          call open_file(iunit_k,file=fnk,form='unformatted',status='old')
          do im=1,in-1
            read(iunit_k)
          enddo
          
          read(iunit_k) wfnk%nkpt,wfnk%ndv,nbandi,(wfnk%isrtk(j),j=1,gvec%ng),(qk(i),i=1,3), &
            ((wfnk%ek(j,k),j=1,nbandi),k=1,sig%nspin), &
            ((wfnk%elda(j,k),j=1,nbandi),k=1,sig%nspin), &
            ((wfnk%zk(j,k),j=1,ndv_ikn),k=1,sig%nspin*kp%nspinor)
          call close_file(iunit_k)
        endif
        
      else ! sig%iwriteint .eq. 0
        
        nbandi=sig%ntband
        wfnk%nkpt=wfnkmpi%nkptotal(ika)
        wfnk%ndv=peinf%ndiag_max*wfnk%nkpt
        wfnk%isrtk(1:wfnk%nkpt)=wfnkmpi%isort(1:wfnk%nkpt,ika)
        qk(1:3)=wfnkmpi%qk(1:3,ika)
        wfnk%ek(1:sig%ntband,1:sig%nspin) = wfnkmpi%el(1:sig%ntband,1:sig%nspin,ika)
        wfnk%elda(1:sig%ntband,1:sig%nspin) = wfnkmpi%elda(1:sig%ntband,1:sig%nspin,ika)
        SAFE_ALLOCATE(wfnk%zk, (wfnk%ndv,sig%nspin*kp%nspinor))
        wfnk%zk=ZERO
        do k=1,sig%nspin*kp%nspinor
#ifdef MPI
          tag=1024
          if (mod(wfnk%ndv,peinf%npes/peinf%npools).eq.0) then
            jj=wfnk%ndv/(peinf%npes/peinf%npools)
          else
            jj=wfnk%ndv/(peinf%npes/peinf%npools)+1
          endif
          do j=1,peinf%npools
            dest=(j-1)*(peinf%npes/peinf%npools)
            if (peinf%inode.eq.dest) then
              g1=1
              g2=min(jj,wfnk%ndv)
              if (g2.ge.g1) then
                wfnk%zk(g1:g2,k)=wfnkmpi%cg(g1:g2,k,ika)
              endif ! g2.ge.g1
            endif
            do i=2,peinf%npes/peinf%npools
              source=(i-1)+dest
              g1=1+(i-1)*jj
              g2=min(jj+(i-1)*jj,wfnk%ndv)
              if (g2.ge.g1) then
                if (peinf%inode.eq.source) &
                  call MPI_Send(wfnkmpi%cg(1,k,ika),g2-g1+1,MPI_SCALAR,dest,tag,MPI_COMM_WORLD,mpierr)
                if (peinf%inode.eq.dest) &
                  call MPI_Recv(wfnk%zk(g1,k),g2-g1+1,MPI_SCALAR,source,tag,MPI_COMM_WORLD,mpistatus,mpierr)
              endif ! g2.ge.g1
            enddo ! i=2,peinf%npes/peinf%npools
          enddo ! j=1,peinf%npools
#else
          wfnk%zk(1:wfnk%ndv,k)=wfnkmpi%cg(1:wfnk%ndv,k,ika)
#endif
        enddo ! k=1,sig%nspin
      endif ! sig%iwriteint .eq. 0

#ifdef MPI
      call MPI_Bcast(wfnk%nkpt,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
      call MPI_Bcast(wfnk%ndv,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
      call MPI_Bcast(nbandi,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
      call MPI_Bcast(wfnk%isrtk,gvec%ng,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
      call MPI_Bcast(qk,3,MPI_REAL_DP,0,MPI_COMM_WORLD,mpierr)
      call MPI_Bcast(wfnk%ek,nbandi*sig%nspin,MPI_REAL_DP,0,MPI_COMM_WORLD,mpierr)
      call MPI_Bcast(wfnk%elda,nbandi*sig%nspin,MPI_REAL_DP,0,MPI_COMM_WORLD,mpierr)

      if (sig%iwriteint .eq. 0) then
        if (mod(peinf%inode,peinf%npes/peinf%npools).ne.0) then
          SAFE_ALLOCATE(wfnk%zk, (wfnk%ndv,sig%nspin*kp%nspinor))
        endif
      endif
      
      if (peinf%npools.eq.1) then
        call MPI_Bcast(wfnk%zk,wfnk%ndv*sig%nspin*kp%nspinor,MPI_SCALAR,0,MPI_COMM_WORLD,mpierr)
      else
        tag=1024
        do j=1,peinf%npools
          source=(j-1)*(peinf%npes/peinf%npools)
          do i=2,peinf%npes/peinf%npools
            dest=(i-1)+source
            if (peinf%inode.eq.source) &
              call MPI_Send(wfnk%zk,wfnk%ndv*sig%nspin,MPI_SCALAR,dest,tag,MPI_COMM_WORLD,mpierr)
            if (peinf%inode.eq.dest) &
              call MPI_Recv(wfnk%zk,wfnk%ndv*sig%nspin,MPI_SCALAR,source,tag,MPI_COMM_WORLD,mpistatus,mpierr)
          enddo
        enddo
      endif
#endif
    endif ! sig%nkn.gt.1
    if (peinf%inode.eq.0) call timacc(19,2)
    
    if(peinf%inode.eq.0) then
      write(6,*)
      call print_dealing_with(ika, sig%nkn, kp%rk(:,ikn), 'k')
    endif
    
!----------------------------
! Initialize Matrix Elements

    alda=0.0d0
    ax=0.0d0
    axbis=0.0d0
    achcor(:,:)=(0.0d0,0.0d0)
    asig_imag(:,:)=(0.0d0,0.0d0)
    achcorbis(:,:)=(0.0d0,0.0d0)
    asig_imagbis(:,:)=(0.0d0,0.0d0)
    if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
      asx(:,:,:)=ZERO
      ach(:,:,:)=ZERO
      asig(:,:)=ZERO
      ach_n1q(:,:,:)=ZERO
      ach_n1(:,:,:)=ZERO
      asxbis(:,:,:)=ZERO
      achbis(:,:,:)=ZERO
    endif
    if (sig%freq_dep.eq.2) then
      asxDyn(:,:,:)=(0.0d0,0.0d0)
      achDyn(:,:,:)=(0.0d0,0.0d0)
      achDyn_cor(:,:,:)=(0.0d0,0.0d0)
      achDyn_corb(:,:,:)=(0.0d0,0.0d0)
      ach2Dyn(:,:,:)=(0.0d0,0.0d0)
      asigDyn(:,:)=(0.0d0,0.0d0)
      achD_n1q(:,:,:)=(0.0d0,0.0d0)
      achD_n1(:,:,:)=(0.0d0,0.0d0)
      asxDbis(:,:,:)=(0.0d0,0.0d0)
      achDbis(:,:,:)=(0.0d0,0.0d0)
      achDbis_cor(:,:,:)=(0.0d0,0.0d0)
      achDbis_corb(:,:,:)=(0.0d0,0.0d0)
      ach2Dbis(:,:,:)=(0.0d0,0.0d0)
    endif
    
!----------------------------
! Read matrix elements of Vxc from file vxc.dat
! or compute them on the fly from Vxc potential

    if (peinf%inode.eq.0) call timacc(5,1)
    if(sig%use_vxcdat) then
      if(peinf%inode == 0) write(6,*) 'Reading vxc.dat'
      call open_file(120,file='vxc.dat',form='formatted',status='old')
      qk(:)=INF
      ierr=0
      do while (ierr.eq.0)
        call read_matrix_elements_type(120, ierr, qk, sig, alda)
        if (all(abs(kp%rk(1:3,ikn)-qk(1:3)) .lt. TOL_Small)) exit
      enddo
      call close_file(120)
          
! Check k-point

      if(any(abs(kp%rk(1:3,ikn)-qk(1:3)) .ge. TOL_Small)) then
        call die('cannot find k-point in vxc.dat', only_root_writes = .true.)
      endif

! Divide by ryd for diag
! this will be undone by shift_energy routines later

      do s2=1,sig%nspin
        do in=1,sig%ndiag
          alda(in,s2) = alda(in,s2)/ryd
        enddo
      enddo

    else ! not using vxc.dat
      
      call logit('Calling mtxel_vxc')
      call mtxel_vxc(kp,gvec,sig,wfnk,wfnkoff,alda,.false.)

    endif


    if (peinf%inode.eq.0) call timacc(5,2)
    
#ifdef CPLX
    if (any(abs(IMAG(alda(1:sig%ndiag,:)))*ryd > 1.0d-4)) imagvxcflag=.true.
#endif

!----------------------------
! Read ax from existing data

    if(sig%use_xdat .and. xflag .and. (.not. sig%coul_mod_flag)) then

      if(peinf%inode == 0) write(6,*) 'Reading x.dat'
      call open_file(119,file='x.dat',form='formatted',status='old')
      qk(:)=INF
      ierr=0
      do while (ierr.eq.0)
        call read_matrix_elements_type(119, ierr, qk, sig, ax)
        if (all(abs(kp%rk(1:3,ikn)-qk(1:3)) .lt. TOL_Small)) exit
      enddo
      ax(:,:) = ax(:,:) * sig%xfrac
      call close_file(119)
      
! Check k-point

      if(any(abs(kp%rk(1:3,ikn)-qk(1:3)) .ge. TOL_Small)) then
        call die('cannot find k-point in x.dat', only_root_writes = .true.)
      endif

! Divide by ryd for diag

      do s2=1,sig%nspin
        do in=1,sig%ndiag
          ax(in,s2) = ax(in,s2)/ryd
        enddo
      enddo

#ifdef CPLX
    if (any(abs(IMAG(ax(1:sig%ndiag,:)))*ryd > 1.0d-4)) imagxflag=.true.
#endif

    endif ! using x.dat

!----------------------------
! Find subgroup which leaves kn invariant
! Indices of group operations in subgroup stored in array indsub
! stored in structure syms

    qk(:) = kp%rk(:,ikn)
    if (peinf%inode.eq.0) call timacc(6,1)
    if (sig%qgridsym) then
      call subgrp(qk,syms)
    else
      syms%ntranq=1
      syms%indsub(1)=1
      syms%kgzero(1:3,1)=0
    endif
    if (peinf%inode.eq.0) call timacc(6,2)

!----------------------------
! Reduce qpoints with respect to group of kn
! Keep track of number of q-points equivalent
! to give qpoint in irr-bz neq(irq)
!
! Keep track of q-point in the set given in epsmat
! RQ = R(Q) + G0 with transformation R and umklapp G0
! In order to unfold the inverse dielectric matrix from Q to RQ
!
! The q-points are the points for which we have epsilon
! (should be the unshifted irreducible grid)

    SAFE_ALLOCATE(indrq, (gr%nf))
    SAFE_ALLOCATE(neq, (gr%nf))
    SAFE_ALLOCATE(itnrq, (gr%nf))
    SAFE_ALLOCATE(rq, (3,gr%nf))
    SAFE_ALLOCATE(kg0, (3,gr%nf))
    if (peinf%inode.eq.0) call timacc(7,1)
    call irrbz(syms,gr%nf,gr%f,nrq,neq,indrq,rq,sig%nq,sig%qpt,itnrq,kg0,nfix=sig%nq0)
    if (peinf%inode.eq.0) call timacc(7,2)
    

!!---------- Loop over k-points in irr bz with respect to kn (rq) --------------

    irq_min = 1
    call progress_init(prog_info, 'calculating Sigma', 'block', &
      (nrq-irq_min+1)*(peinf%ndiag_max+peinf%noffdiag_max)*sig%nspin)

    do irq_ = irq_min, nrq
      irq = irq_
      is_subq = .false.

#ifdef VERBOSE
      if(peinf%inode.eq.0) then
        write(6,60) irq,nrq
        write(6,*) ' '
      endif
60    format(/,3x,'qpoint',i5,' out of ',i5)
#endif

! Compute energies: |q+g|**2
      if (is_subq) then
        call kinetic_energies(gvec, crys%bdot, ekin)
      else
        call kinetic_energies(gvec, crys%bdot, ekin, qvec = rq(1:3, irq))
      endif

! Sort ekin in ascending order for this q
! The indices are placed in array isrtrq
      call sortrx(gvec%ng, ekin, isrtrq, gvec = gvec%components)
      if ((sig%freq_dep.eq.0.or.sig%exact_ch.eq.1).and.irq_==irq_min) then
        isrtrqi=0
        do j=1,gvec%ng
          if (isrtrq(j).ge.1.and.isrtrq(j).le.gvec%ng) &
            isrtrqi(isrtrq(j))=j
        enddo
      endif

! Compute cutoff in sums over G,G`
      ncoulb = gcutoff(gvec%ng, ekin, isrtrq, sig%ecutb)
      ncouls = gcutoff(gvec%ng, ekin, isrtrq, sig%ecuts)
      ncoul = max(ncouls,ncoulb)

      if (sig%freq_dep.eq.0.or.sig%exact_ch.eq.1) then
        if(irq_==irq_min) ncoulch = ncoul
      else
        ncoulch = 0
      endif

      ! this condition is assumed later in the code (e.g. wpeff array sizes), so we must check it
      if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
        if (ncouls.gt.neps) then
          write(tmpstr,'(a,i6,a,i6)')&
            "screened Coulomb cutoff is bigger than epsilon cutoff"//CHAR(10)&
            //' ncouls = ',ncouls,' neps = ',neps
          ! NB: CHAR(10) is the carriage return.
          call die(tmpstr, only_root_writes = .true.)
        endif
      endif

!----------------------------
! Allocate arrays for q-point rq(:,irq)
      SAFE_ALLOCATE(vcoul, (ncoul))
      SAFE_ALLOCATE(aqs, (ncoul,peinf%ntband_max))
      if (sig%noffdiag.gt.0) then 
        SAFE_ALLOCATE(aqsaug, (ncoul,peinf%ntband_max,sig%ndiag,sig%nspin))
      endif
      if ((sig%freq_dep.eq.0.or.sig%exact_ch.eq.1).and.irq_==irq_min) then
        SAFE_ALLOCATE(aqsch, (ncoulch))
        if (nrq.gt.1) then
          SAFE_ALLOCATE(aqsaugchd, (ncoulch,peinf%ndiag_max,sig%nspin))
          if (sig%noffdiag.gt.0) then
            SAFE_ALLOCATE(aqsaugcho, (ncoulch,peinf%noffdiag_max,sig%nspin))
          end if
        endif
      endif
      nm = indrq(irq)


!!!------- Calculate Vcoul -----------------------------------------------------
      if (peinf%inode.eq.0) call timacc(13,1)
      vq=rq(:,irq)
      qlen = sqrt(DOT_PRODUCT(vq,MATMUL(crys%bdot,vq)))
      if(sig%freq_dep /= -1) call check_screening_trunc(sig%icutv,sig%iscreen,sig%q0vec,crys%bdot)
      iparallel=1
      avgcut = sig%avgcut
      if (.not. sig%coul_mod_flag) then
        call vcoul_generator(sig%icutv,sig%truncval,gvec,crys%bdot, &
          gr%nf-sig%nq0+1,ncoul,isrtrq,sig%iscreen,vq,sig%q0vec,vcoul, &
          sig%iwritecoul,iparallel,avgcut,oneoverq,sig%qgrid,epshead, &
          work_scell,sig%averagew,sig%wcoul0)
      endif

      fact = 1D0/(dble(gr%nf-sig%nq0+1)*crys%celvol)
      coulfact = 8D0*PI_D/(dble(gr%nf-sig%nq0+1)*crys%celvol)
      do ig = 1, ncoul
        vcoul(ig)=fact*vcoul(ig)
      enddo
      if (ika.eq.1.and.irq_==irq_min) then
        sig%wcoul0 = sig%wcoul0 * fact
      endif
      if (peinf%inode.eq.0) call timacc(13,2)
      

!!!------- Read inverse dielectric matrix for q-point rq(:,irq) ----------------
      if (peinf%inode.eq.0) call timacc(14,1)
      
      if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
        
        q0len = sqrt(DOT_PRODUCT(sig%q0vec,MATMUL(crys%bdot,sig%q0vec)))
        
!----------------------------
! Processor 0 read eps^-1 for this q and broadcast to others
!
! Note: the total number of g-vectors used during
! computation of the inverse dielectric matrix
! may be different than in present calculation
! although the sets must coincide for small g
!
! JRD: We must find correct spot in EPS File
! Possible Performance Hazard
! Speed up might be breaking files up
!
! CHP: Now this hazard is gone with comm_mpi

        if(sig%iwriteint.eq.0) then
          
          if (peinf%inode.eq.0) then
            call open_file(iunit_eps,file=fne,form='unformatted',status='old')
              
! CHP: "if(nm.ne.1) then" is NOT necessary since we have "do ii=1,nm-1"
!                if (nm .ne. 1) then
            do ii=1,nm-1
              read(iunit_eps) ngqt,nmtxt
              if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
                do j=1,nmtxt
                  read(iunit_eps)
                enddo
              else ! sig%freq_dep.eq.2
                do j=1,nmtxt
                  do i=1,nmtxt
                    read(iunit_eps) ! epsR
                  enddo
#ifdef CPLX
                  do i=1,nmtxt
                    read(iunit_eps) ! epsA
                  enddo
#endif
                enddo
              endif
            enddo
!                endif ! if (nm .ne. 1)

            read(iunit_eps) ngq,nmtx,isrtq(1:ngq),isrtqi(1:ngq),(qk(i),i=1,3)
          endif ! proc 0

#ifdef MPI
          call MPI_Bcast(ngq, 1, MPI_INTEGER, 0, MPI_COMM_WORLD,mpierr)
          call MPI_Bcast(nmtx, 1, MPI_INTEGER, 0, MPI_COMM_WORLD,mpierr)
          call MPI_Bcast(isrtq, ngq, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
          call MPI_Bcast(isrtqi, ngq, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
          call MPI_Bcast(qk, 3, MPI_REAL_DP, 0, MPI_COMM_WORLD, mpierr)
#endif
        else if (sig%iwriteint.eq.1) then
          ngq = gvec%ng
          nmtx = epsmpi%nmtx(nm)

          if (peinf%inode .eq. 0) then
            isrtq(:) = epsmpi%isrtq(:,nm)
            isrtqi(:) = epsmpi%isrtqi(:,nm)
          endif
#ifdef MPI
          call MPI_Bcast(isrtq, ngq, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
          call MPI_Bcast(isrtqi, ngq, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
#endif
          qk(:) = epsmpi%qk(:,nm)
        endif

! Find g=0 in main gvec list and eps gvector list

        ! write explicitly to avoid possible warning about array temporary
        nullvec(1:3) = 0
        call findvector(iout,nullvec,gvec)
        iout = isrtqi(iout)
          
#ifdef VERBOSE
        if (peinf%inode.eq.0) write(6,*) 'Reading Eps Back'
#endif
        if (sig%iwriteint.eq.0) then
          
          do ijk = 1, nmtx

            if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
              epstemp=0D0
            else ! sig%freq_dep.eq.2 .or. 3
              epsRtemp=0D0
#ifdef CPLX
              epsAtemp=0D0
#endif
            endif
            
            if (peinf%inode .eq. 0) then
              
              if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
                read(iunit_eps) (epstemp(i),i=1,nmtx)
              else ! sig%freq_dep.eq.2 .or. 3
                do i=1,nmtx
                  read(iunit_eps) (epsRtemp(jj,i),jj=1,sig%nFreq)
                enddo
#ifdef CPLX
                do i=1,nmtx
                  read(iunit_eps) (epsAtemp(jj,i),jj=1,sig%nFreq)
                enddo
#endif
              endif
            endif

!-------------------------------------------------------------------------------
! Now distribute it

! JRD: Possible time Hazard.  Can we do this all in epscopy once and for all?
! CHP: Yes, use comm_mpi!

! The Broadcast here is bad. But, that is the price you pay for comm_disk...

#ifdef MPI
            dest=igp_owner(ijk)
            if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
              call MPI_Bcast(epstemp,neps,MPI_SCALAR,0,MPI_COMM_WORLD,mpierr)
            else ! sig%freq_dep.eq.2 .or. 3
              call MPI_Bcast(epsRtemp,sig%nFreq*neps,MPI_COMPLEX_DPC,0,MPI_COMM_WORLD,mpierr)
#ifdef CPLX
              call MPI_Bcast(epsAtemp,sig%nFreq*neps,MPI_COMPLEX_DPC,0,MPI_COMM_WORLD,mpierr)
#endif
            endif
#endif
              
            if (peinf%pool_rank .eq. igp_owner(ijk)) then
              if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
                eps(:,igp_index(ijk))=epstemp(:)
              else !  sig%freq_dep.eq.2 .or. 3
                epsR(:,:,igp_index(ijk))=epsRtemp(:,:)
#ifdef CPLX
                epsA(:,:,igp_index(ijk))=epsAtemp(:,:)
#endif
              endif
#ifdef VERBOSE
699           format(1x,"WARNING: distribution of inverse epsilon matrix", &
                /,3x,"inode =",i5,1x,"ijk =",i6,1x,"igp_index(ijk) =",i2,/)
              if (igp_index(ijk) .eq. 0) then
                write(0,699) peinf%inode, ijk, igp_index(ijk)
              endif
#endif
            endif ! proc non-0
            
          enddo  ! ijk
        endif  ! if (sig%iwriteint.eq.0)

! JRD: XXX This is sort of a waste of memory... Can we use pointers for this sort of thing?
        
        if (sig%iwriteint.eq.1) then
          if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
            eps(:,:)=epsmpi%eps(:,:,nm)
          else ! sig%freq_dep.eq.2 .or. 3
            epsR(:,:,:)=epsmpi%epsR(:,:,:,nm)
#ifdef CPLX
            epsA(:,:,:)=epsmpi%epsA(:,:,:,nm)
#endif
          endif
        endif ! if (sig%iwriteint.eq.1)
        
! CHP: By first distributing and then doing the wing fix, we can let
!      all the processors work together, thus, saving some time.
!      This is the case for both comm_mpi and comm_disk.

!-------------------------------------------------------------------------------
! Fix wing divergence for semiconductors and graphene

! This should really be done for all "|q+G| < avgcut" - but for now,
! it is done if "|q| < avgcut and G=0"

        if (.not.sig%subsample) then
        do ijk = 1, nmtx
          
          if (peinf%pool_rank.ne.igp_owner(ijk)) cycle
          
          if (igp_index(ijk).eq.0) write(6,*) 'ijk!!',peinf%inode,ijk
          
          if (nm .eq. 1) then
            q0flag=.true.
            if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
              call fixwings(vcoul(1),sig%wcoul0,eps(:,igp_index(ijk)),sig%icutv, &
                sig%iscreen,ijk,nmtx,iout,q0len,oneoverq,fact,q0flag,sig%averagew,crys%bdot)
            else ! sig%freq_dep.eq.2 .or. 3
              call fixwings_dyn(vcoul(1),epsR(:,:,igp_index(ijk)), &
                sig%icutv,sig%iscreen,ijk,sig%nFreq,nmtx,iout,q0len,oneoverq,fact,q0flag,crys%bdot)
#ifdef CPLX
              call fixwings_dyn(vcoul(1),epsA(:,:,igp_index(ijk)), &
                sig%icutv,sig%iscreen,ijk,sig%nFreq,nmtx,iout,q0len,oneoverq,fact,q0flag,crys%bdot)
#endif
            endif
          else if (qlen**2 .lt. sig%avgcut) then
            q0flag=.false.
            if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
              call fixwings(vcoul(1),sig%wcoul0,eps(:,igp_index(ijk)),sig%icutv, &
                sig%iscreen,ijk,nmtx,iout,qlen,oneoverq,fact,q0flag,sig%averagew,crys%bdot)
            else ! sig%freq_dep.eq.2 .or. 3
              call fixwings_dyn(vcoul(1),epsR(:,:,igp_index(ijk)), &
                sig%icutv,sig%iscreen,ijk,sig%nFreq,nmtx,iout,qlen,oneoverq,fact,q0flag,crys%bdot)
#ifdef CPLX
              call fixwings_dyn(vcoul(1),epsA(:,:,igp_index(ijk)), &
                sig%icutv,sig%iscreen,ijk,sig%nFreq,nmtx,iout,qlen,oneoverq,fact,q0flag,crys%bdot)
#endif
            endif
          endif
          
        enddo ! ijk
        endif !.not.sig%subsample
        
        if(sig%iwriteint == 0) then
          call logit('Read eps from disk')
        else
          call logit('Read eps from memory')
        endif

        if (peinf%inode.eq.0 .and. sig%iwriteint == 0) then
          write(6,*) 'Done Read Eps Back'
! use of below causes NaNs from bad optimization, blocked by statement containing NaN below
! bad at -O2, -O3, ok at -O1 and -O0 at least for gfortran. --DAS
          call close_file(iunit_eps)
!          close(iunit_eps)  ! using this line instead has no optimization issue
        endif
        
      endif  ! sig%freq_dep
      
      if (sig%freq_dep.eq.-1) then
        ngq=0
        nmtx=0
        qk(:)=sig%qpt(:,nm)
      endif
      
      if (peinf%inode.eq.0) call timacc(14,2)
      
#ifdef VERBOSE
      if (peinf%inode.eq.0) then
        write(6,'(3(a,i10))') ' *** VERBOSE: nmtx =',nmtx,' ncouls =',ncouls
        write(6,*)
      endif
#endif
      if(nmtx.gt.neps) then
        call die('nmtx.gt.neps')
      endif
        
! Check q-vector

      if(any(abs(sig%qpt(1:3, nm) - qk(1:3)) .gt. TOL_SMALL)) then
        write(0,*) peinf%inode,sig%qpt(:,nm),qk(:)
        call die('q-vector check wrong')
      endif
        
      if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
        if(ncouls .gt. nmtx) ncouls = nmtx
      endif

#ifdef VERBOSE
      if(peinf%inode.eq.0) then
        if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
          write(6,100) (rq(i,irq),i=1,3),ncouls,dble(eps(1,1))
        endif
        if (sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
          write(6,100) (rq(i,irq),i=1,3),ncouls,dble(epsR(1,1,1))
        endif
      endif
100   format(3x,'q=',3f8.5,2x,'n=',i6,2x,'head of epsilon inverse =',f12.6,/)
#endif

! Map g-vectors required for eps**(-1)(r(q)) to those
! for the known matrix eps**(-1)(q) and calculate phases

      itran = itnrq(irq)
      kg(:) = kg0(:,irq)
      if (peinf%inode.eq.0) call timacc(8,1)
      call logit('Calling gmap')
      if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
        ind(:)=0
        indinv(:)=0
        ph(:)=ZERO
        call gmap(gvec,syms,ncouls,itran,kg,isrtrq,isrtqi,ind,ph, &
         sig%die_outside_sphere) ! TROUBLE
        do ig = 1, gvec%ng
          if (ind(ig) .gt. 0 .and. ind(ig) .le. gvec%ng) then
            indinv(ind(ig))=ig
          endif
        enddo
      endif
      if (peinf%inode.eq.0) call timacc(8,2)
      

!!!------- Done reading inverse dielectric matrix for q-point rq(:,irq) --------


!--------------------
! Generate needed wavefunctions for rkq = rkn - rq
! stored in derived type wfnkq

      rkq(1:3) = kp%rk(1:3, ikn) - rq(1:3, irq)
      if (peinf%inode.eq.0) call timacc(9,1)
      call logit('Calling genwf')
      if (.not.is_subq.or.irq_==0) then ! FHJ: subsampling uses the same WFNs and matrix elements
        if (sig%iwriteint .eq. 0) then
          call genwf_disk(rkq,syms,gvec,crys,kp,sig,wfnkq,iunit_c,fnc)
        else
          call genwf_mpi(rkq,syms,gvec,crys,kp,sig,wfnkq,wfnkqmpi)
        endif
      endif
      if (peinf%inode.eq.0) call timacc(9,2)
      
!!-------- Loop over spins for which Sigma is computed -------------------------
      do ispin=1,sig%nspin


!!-------- Loop over bands for which diag Sigma is computed --------------------
! Bands are relabelled according to sig%diag(1:sig%ndiag)
! Loop is parallelized according to peinf%index_diag(1:peinf%ndiag_max)

        do in=1,peinf%ndiag_max
          
          call progress_step(prog_info)
#ifdef VERBOSE
          if (peinf%inode.eq.0) then
            if (peinf%npools.eq.1) then
              write(6,999) peinf%index_diag(in)+(ispin-1)*sig%ndiag, &
                peinf%ndiag_max*peinf%npools*sig%nspin
            else
              write(6,997) peinf%index_diag(in)+(ispin-1)*sig%ndiag, &
                peinf%index_diag(in)+(ispin-1)*sig%ndiag+peinf%npools-1, &
                peinf%ndiag_max*peinf%npools*sig%nspin
            endif
          endif
999       format(1x,"Computing Sigma diag",i4,1x,"of",i4)
997       format(1x,"Computing Sigma diag",i4,1x,"to",i4,1x,"of",i4)
#endif
          write(tmpstr,*) 'Working on band ', sig%diag(peinf%index_diag(in)), ' 1st pool'
          call logit(tmpstr)

!---------------------
! Compute planewave matrix elements of g <n,k|exp{i(q+g).r}|n1,k-q>
! Note: wfnk%zk array keeps only the bands specified in sig%diag(:)
! Must keep track of the right band label

          call logit('Calling mtxel')
          if (peinf%inode.eq.0) call timacc(10,1)
          call mtxel(in,gvec,wfnkq,wfnk,ncoul,isrtrq,aqs,ispin,kp)
          if (peinf%inode.eq.0) call timacc(10,2)
          if (sig%noffdiag.gt.0.and.peinf%flag_diag(in)) then
            do j=1,peinf%ntband_node
              do i=1,ncoul
                aqsaug(i,j,peinf%index_diag(in),ispin)=aqs(i,j)
              enddo
            enddo
          endif
          if (sig%freq_dep.eq.0.or.sig%exact_ch.eq.1) then
            if (irq_==irq_min) then
              call logit('Calling mtxel_ch')
              call timacc(16,1)
! JRD ALL PROCS DO THIS NOW. 
              call mtxel_ch(in,in,gvec,wfnk,ncoulch,isrtrq,aqsch,ispin,kp)
              call timacc(16,2)
              if (nrq.gt.1) aqsaugchd(:,in,ispin)=aqsch(:)
            else
              aqsch(:)=aqsaugchd(:,in,ispin)
            endif
          endif

!---------------------
! Compute diag SX and CH

          if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
            call logit('Calling mtxel_sxch for diagonal matrix elements')
            if (peinf%inode.eq.0) call timacc(11,1)
          
            call mtxel_sxch(peinf%index_diag(in), &
             peinf%index_diag(in),ispin,ncouls,neps,gvec,eps,ph, &
             ind,indinv,isrtrqi,isrtrq,vcoul,crys,sig,wpg,wfnk,wfnkq,ncoulch, &
             aqs,aqs,aqsch,asigt_imag,acht_n1,asxt,acht,achtcor, &
             kp%nspin,rq(:,irq),coulfact, &
             inv_igp_index,ngpown, &
             epsR,epsA,achtD_n1,asxtDyn,achtDyn,achtDyn_cor,achtDyn_corb,ach2tDyn,1)

            if (peinf%inode.eq.0) call timacc(11,2)
          else
            achtcor = ZERO
            asigt_imag = ZERO
          endif
          
          if (peinf%flag_diag(in)) then
            achcor(peinf%index_diag(in),ispin)= &
              achcor(peinf%index_diag(in),ispin)+neq(irq)*achtcor
            asig_imag(peinf%index_diag(in),ispin)= &
              asig_imag(peinf%index_diag(in),ispin)+neq(irq)*asigt_imag
            if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
              do iw=1,nfreqgpp
                asx(iw,peinf%index_diag(in),ispin)= &
                  asx(iw,peinf%index_diag(in),ispin)+neq(irq)*asxt(iw)
                ach(iw,peinf%index_diag(in),ispin)= &
                  ach(iw,peinf%index_diag(in),ispin)+neq(irq)*acht(iw)
              enddo
              do ib=1,sig%ntband
                ach_n1q(ib,peinf%index_diag(in),ispin)= &
                  ach_n1q(ib,peinf%index_diag(in),ispin)+neq(irq)*acht_n1(ib)
              enddo
            endif
            if (sig%freq_dep.eq.2) then
              do iw=1,sig%nfreqeval
                asxDyn(iw,peinf%index_diag(in),ispin)= &
                  asxDyn(iw,peinf%index_diag(in),ispin)+neq(irq)*asxtDyn(iw)
                achDyn(iw,peinf%index_diag(in),ispin)= &
                  achDyn(iw,peinf%index_diag(in),ispin)+neq(irq)*achtDyn(iw)
                achDyn_cor(iw,peinf%index_diag(in),ispin)= &
                  achDyn_cor(iw,peinf%index_diag(in),ispin)+neq(irq)*achtDyn_cor(iw)
                achDyn_corb(iw,peinf%index_diag(in),ispin)= &
                  achDyn_corb(iw,peinf%index_diag(in),ispin)+neq(irq)*achtDyn_corb(iw)
                ach2Dyn(iw,peinf%index_diag(in),ispin)= &
                  ach2Dyn(iw,peinf%index_diag(in),ispin)+neq(irq)*ach2tDyn(iw)
              enddo
              do ib=1,sig%ntband
                achD_n1q(ib,peinf%index_diag(in),ispin)= &
                  achD_n1q(ib,peinf%index_diag(in),ispin)+neq(irq)*achtD_n1(ib)
              enddo
            endif
          endif
          
!---------------------
! Compute diag bare exchange (SKIP this computation if you already know it)

          if (peinf%inode.eq.0) call timacc(18,1)
          if ((.not. sig%use_xdat .and. xflag .and. (.not. sig%coul_mod_flag)) .or. (sig%coul_mod_flag &
            .and. (.not.  sig%use_vxc2dat))) then
            call logit('Computing bare X')
            axt=0.0d0
              
! XXX THREAD?  
            do n1=1,peinf%nvband_node
              tempval = wfnkq%ekq(peinf%indext(n1),ispin) - sig%efermi
              if (tempval < sig%tol) then
                if(abs(tempval) < sig%tol) then
                  occ=0.5  ! Fermi-Dirac distribution = 1/2 at Fermi level
                else
                  occ=1D0
                endif
                do ig=1,ncoulb
                  axt = axt + abs(aqs(ig,n1))**2 * occ * vcoul(ig)
                enddo
              endif
                ! sig%ncrit = 0 and tempval > sig%tol should never happen!
            enddo
            if (peinf%flag_diag(in)) then
              ax(peinf%index_diag(in),ispin)=ax(peinf%index_diag(in),ispin)-neq(irq)*axt*sig%xfrac
            endif
          endif ! not use x.dat
          if (peinf%inode.eq.0) call timacc(18,2)
          
        enddo ! in (loop over bands for which we need diag Sigma)
        
        if (ispin.eq.sig%nspin) then
          if (.not.is_subq.or.irq_==sig%nq0) then
            SAFE_DEALLOCATE_P(wfnkq%zkq)
          endif
          SAFE_DEALLOCATE(aqs)
          if ((sig%freq_dep.eq.0.or.sig%exact_ch.eq.1).and.irq_==nrq) then
            if (nrq.gt.1) then
              SAFE_DEALLOCATE_P(aqsaugchd)
            end if
          endif
        endif
        
!!-------- End diag band loop --------------------------------------------------

! (gsm) begin distributing aqsaug matrix elements for offdiag calculation

! $$$ inefficient communication, this should be rewritten $$$

        if (peinf%inode.eq.0) call timacc(17,1)
#ifdef MPI
        if (sig%noffdiag.gt.0.and.peinf%npools.gt.1) then
          tag=1024
          do in=1,sig%ndiag
            jj=mod(in-1,peinf%npools)+1
            do i=1,peinf%npes/peinf%npools
              source=(i-1)+(jj-1)*(peinf%npes/peinf%npools)
              do j=1,peinf%npools
                dest=(i-1)+(j-1)*(peinf%npes/peinf%npools)
                if (j.ne.jj) then
                  if (peinf%inode.eq.source) &
                    call MPI_Send(aqsaug(:,:,in,ispin),peinf%ntband_max*ncoul,MPI_SCALAR,dest,tag,MPI_COMM_WORLD,mpierr)
                  if (peinf%inode.eq.dest) &
                    call MPI_Recv(aqsaug(:,:,in,ispin),peinf%ntband_max*ncoul,MPI_SCALAR, &
                      source,tag,MPI_COMM_WORLD,mpistatus,mpierr)
                endif
              enddo
            enddo
          enddo
        endif ! sig%noffdiag.gt.0.and.peinf%npools.gt.1
#endif
        if (peinf%inode.eq.0) call timacc(17,2)
        
! (gsm) end distributing aqsaug matrix elements for offdiag calculation

!!-------- Loop over bands for which offdiag Sigma is computed -----------------

! Bands are relabelled according to sig%off*(1:sig%noffdiag)
! Loop is parallelized according to peinf%index_offdiag(1:peinf%noffdiag_max)

        do ioff=1,peinf%noffdiag_max
          
          call progress_step(prog_info)
#ifdef VERBOSE
          if (peinf%inode.eq.0) then
            if (peinf%npools.eq.1) then
              write(6,998) peinf%index_offdiag(ioff)+(ispin-1)*sig%noffdiag, &
                peinf%noffdiag_max*peinf%npools*sig%nspin
            else
              write(6,996) peinf%index_offdiag(ioff)+(ispin-1)*sig%noffdiag, &
                peinf%index_offdiag(ioff)+(ispin-1)*sig%noffdiag+peinf%npools-1, &
                peinf%noffdiag_max*peinf%npools*sig%nspin
            endif
          endif
998       format(1x,"Computing Sigma offdiag",i4,1x,"of",i4)
996       format(1x,"Computing Sigma offdiag",i4,1x,"to",i4,1x,"of",i4)
#endif
          write(tmpstr,'(a,2i6,a)') 'Working on bands ', sig%off1(peinf%index_offdiag(ioff)), &
            sig%off2(peinf%index_offdiag(ioff)), ' 1st pool'
          call logit(tmpstr)

! <n|Sigma|m> = 0 if n and m belong to different irreducible representations
! Even without assigning representations, we can tell they are different if
! the size of the degenerate subspace is different for n and m.
! This saves time and helps enforce symmetry. -- DAS
          bExactlyZero = .false.
          if(.not. sig%wfn_outer_present .and. sig%offdiagsym .and. &
             kp%degeneracy(sig%off1(peinf%index_offdiag(ioff)), ikn, ispin) /= &
             kp%degeneracy(sig%off2(peinf%index_offdiag(ioff)), ikn, ispin)) then
            if (peinf%inode.eq.0) write(6,'(a)') 'Zero by symmetry -- not computing.'
            ! the matrix elements are zeroed at the beginning, and at the end of the loop
            ! so we can just leave them as they are
            ! JRD - We cannot cycle here because other pools may have legitimate work to do
            ! and we need to be around for the communication.  Particularly inside the subroutine
            ! mtxel_sxch. Thus we can`t really save time here. We could still zero out elements 
            ! at bottom of loop if want. I leave it up to DAS.
            !cycle
            bExactlyZero=.true.
          endif
            
!---------------------
! Compute planewave matrix elements for exact static CH

          if (sig%freq_dep.eq.0.or.sig%exact_ch.eq.1) then
            if (irq_==irq_min) then
              if (peinf%inode.eq.0) call timacc(20,1)
              wfnkoff%nkpt=wfnk%nkpt
              if (mod(peinf%inode,peinf%npes/peinf%npools).eq.0) then
                SAFE_ALLOCATE(wfnkoff%isrtk, (gvec%ng))
                wfnkoff%isrtk=wfnk%isrtk
                SAFE_ALLOCATE(wfnkoff%zk, (2*wfnk%nkpt,1))
              endif
                
! (gsm) begin gathering wavefunctions over pools

! $$$ inefficient communication, this should be rewritten $$$

              do jj=1,peinf%npools
                dest=(jj-1)*(peinf%npes/peinf%npools)
                do ii=1,2
                  i=sig%offmap(peinf%index_offdiag(ioff),ii)
#ifdef MPI
                  call MPI_Bcast(i,1,MPI_INTEGER,dest,MPI_COMM_WORLD,mpierr)
#endif
                  j=(i-1)/peinf%npools+1
                  source=mod(i-1,peinf%npools)*(peinf%npes/peinf%npools)
                  if (peinf%inode.eq.source.and.peinf%inode.eq.dest) then
                    do k=1,wfnk%nkpt
                      wfnkoff%zk((ii-1)*wfnk%nkpt+k,1) = wfnk%zk((j-1)*wfnk%nkpt+k,ispin)
                    enddo
                  else
#ifdef MPI
                    tag=1024
                    if (peinf%inode.eq.source) &
                      call MPI_Send(wfnk%zk((j-1)*wfnk%nkpt+1,ispin),wfnk%nkpt, &
                        MPI_SCALAR,dest,tag,MPI_COMM_WORLD,mpierr)
                    if (peinf%inode.eq.dest) &
                      call MPI_Recv(wfnkoff%zk((ii-1)*wfnk%nkpt+1,1),wfnk%nkpt, &
                        MPI_SCALAR,source,tag,MPI_COMM_WORLD,mpistatus,mpierr)
#else
                    do k=1,wfnk%nkpt
                      wfnkoff%zk((ii-1)*wfnk%nkpt+k,1) = wfnk%zk((j-1)*wfnk%nkpt+k,ispin)
                    enddo
#endif
                  endif
                enddo
              enddo
              
! (gsm) end gathering wavefunctions over pools

              if (peinf%inode.eq.0) call timacc(20,2)
              call logit('Calling mtxel_ch')
              if (peinf%inode.eq.0) call timacc(16,1)
! JRD Everyone does this now YYYY
              if (mod(peinf%inode,peinf%npes/peinf%npools).eq.0) then
                call mtxel_ch(1,2,gvec,wfnkoff,ncoulch,isrtrq,aqsch,1,kp)
              endif
#ifdef MPI
              call MPI_Bcast(aqsch,ncoulch,MPI_SCALAR,0,peinf%pool_comm,mpierr)
#endif
              if (peinf%inode.eq.0) call timacc(16,2)
              if (mod(peinf%inode,peinf%npes/peinf%npools).eq.0) then
                SAFE_DEALLOCATE_P(wfnkoff%isrtk)
                SAFE_DEALLOCATE_P(wfnkoff%zk)
              endif
              if (nrq.gt.1) aqsaugcho(:,ioff,ispin)=aqsch(:)
            else
              aqsch(:)=aqsaugcho(:,ioff,ispin)
            endif
          endif ! sig%freq_dep.eq.0.or.sig%exact_ch.eq.1

!---------------------
! Compute offdiag SX and CH

          if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
            call logit('Calling mtxel_sxch for offdiagonal matrix elements')
            if (peinf%inode.eq.0) call timacc(11,1)
          
            call mtxel_sxch(sig%offmap(peinf%index_offdiag(ioff),1), &
             sig%offmap(peinf%index_offdiag(ioff),3), &
             ispin,ncouls,neps,gvec,eps,ph,ind,indinv,isrtrqi, &
             isrtrq,vcoul,crys,sig,wpg,wfnk,wfnkq,ncoulch, &
             aqsaug(:,:,sig%offmap(peinf%index_offdiag(ioff),1),ispin), &
             aqsaug(:,:,sig%offmap(peinf%index_offdiag(ioff),2),ispin), &
             aqsch,asigt_imag,acht_n1,asxt,acht,achtcor,kp%nspin, &
             rq(:,irq),coulfact, &
             inv_igp_index,ngpown, &
             epsR,epsA,achtD_n1,asxtDyn,achtDyn,achtDyn_cor,achtDyn_corb,ach2tDyn,2)

            if (peinf%inode.eq.0) call timacc(11,2)
          endif
          
          if (bExactlyZero) cycle
         
          if (peinf%flag_offdiag(ioff)) then
            achcor(peinf%index_offdiag(ioff)+sig%ndiag,ispin)= &
              achcor(peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*achtcor
            asig_imag(peinf%index_offdiag(ioff)+sig%ndiag,ispin)= &
              asig_imag(peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*asigt_imag
            if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
              do iw=1,nfreqgpp
                asx(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)= &
                  asx(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*asxt(iw)
                ach(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)= &
                  ach(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*acht(iw)
              enddo
              do ib=1,sig%ntband
                ach_n1q(ib,peinf%index_offdiag(ioff)+sig%ndiag,ispin) = &
                  ach_n1q(ib,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+neq(irq)*acht_n1(ib)
              enddo
            endif

            if (sig%freq_dep.eq.2) then
              do iw=1,sig%nfreqeval
                asxDyn(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin) = &
                  asxDyn(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*asxtDyn(iw)
                achDyn(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin) = &
                  achDyn(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*achtDyn(iw)
                achDyn_cor(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin) = &
                  achDyn_cor(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*achtDyn_cor(iw)
                achDyn_corb(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin) = &
                  achDyn_corb(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*achtDyn_corb(iw)
                ach2Dyn(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin) = &
                  ach2Dyn(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*ach2tDyn(iw)
              enddo
              do ib=1,sig%ntband
                achD_n1q(ib,peinf%index_offdiag(ioff)+sig%ndiag,ispin) = &
                  achD_n1q(ib,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+neq(irq)*achtD_n1(ib)
              enddo
            endif
          endif
          
!---------------------
! Compute offdiag bare exchange (SKIP this computation if you already know it)

          if (peinf%inode.eq.0) call timacc(18,1)
          if ((.not. sig%use_xdat .and. xflag .and. (.not. sig%coul_mod_flag)) .or. (sig%coul_mod_flag &
            .and. (.not.  sig%use_vxc2dat))) then
            axt=0.0d0
! XXX THREAD?
            do n1=1,peinf%nvband_node
              tempval = wfnkq%ekq(peinf%indext(n1),ispin) - sig%efermi
              if (tempval < sig%tol) then
                if(abs(tempval) < sig%tol) then
                  occ=0.5  ! Fermi-Dirac distribution = 1/2 at Fermi level
                else
                  occ=1D0
                endif
                do ig=1,ncoulb
                  axt=axt+aqsaug(ig,n1,sig%offmap(peinf%index_offdiag(ioff),1),ispin) &
                    *MYCONJG(aqsaug(ig,n1,sig%offmap(peinf%index_offdiag(ioff),2),ispin))*occ &
                    *vcoul(ig)
                enddo
              endif
            enddo
            if (peinf%flag_offdiag(ioff)) then
              ax(peinf%index_offdiag(ioff)+sig%ndiag,ispin)= &
                ax(peinf%index_offdiag(ioff)+sig%ndiag,ispin) - neq(irq)*axt*ryd*sig%xfrac
            endif
          endif ! not using x.dat
          if (peinf%inode.eq.0) call timacc(18,2)
          
        enddo ! ioff (loop over bands for which we need offdiag Sigma)
        
        if (ispin.eq.sig%nspin) then
          SAFE_DEALLOCATE(vcoul)
          if (sig%noffdiag.gt.0) then
            SAFE_DEALLOCATE(aqsaug)
          end if
          if ((sig%freq_dep.eq.0.or.sig%exact_ch.eq.1).and.irq_==nrq) then
            SAFE_DEALLOCATE_P(aqsch)
            if (nrq.gt.1.and.sig%noffdiag.gt.0) then
              SAFE_DEALLOCATE_P(aqsaugcho)
            end if
          endif
        endif
          
!!-------- End offdiag band loop -----------------------------------------------


      enddo ! ispin

!!-------- End spin loop -------------------------------------------------------

!---------------------
! Add up contributions from all PEs

      if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
        SAFE_ALLOCATE(xdum1, ((3+2*nfreqgpp)*(sig%ndiag+sig%noffdiag),sig%nspin))
        do ispin=1,sig%nspin
          do in=1,sig%ndiag+sig%noffdiag
            idum = (in - 1) * (3 + 2*nfreqgpp)
            xdum1(idum+1,ispin) = ax(in,ispin)
            xdum1(idum+2:idum+1+nfreqgpp,ispin) = asx(:,in,ispin)
            xdum1(idum+2+nfreqgpp:idum+1+2*nfreqgpp,ispin) = ach(:,in,ispin)
            xdum1(idum+2+2*nfreqgpp,ispin) = SCALARIFY(achcor(in,ispin))
            xdum1(idum+3+2*nfreqgpp,ispin) = SCALARIFY(asig_imag(in,ispin))

          enddo
        enddo
      endif
      if (sig%freq_dep.eq.2) then
        nfold = (5*sig%nfreqeval+2)            
        SAFE_ALLOCATE(xdum1Dyn, (nfold*(sig%ndiag+sig%noffdiag),sig%nspin))
        do ispin=1,sig%nspin
          do in=1,sig%ndiag+sig%noffdiag
            idum = (in - 1) * nfold
            xdum1Dyn(idum+1,ispin) = ax(in,ispin)
            do ifold = 1, sig%nfreqeval
              xdum1Dyn(idum+1+ifold,ispin) = asxDyn(ifold,in,ispin)
            enddo
            do ifold = 1, sig%nfreqeval
              xdum1Dyn(idum+1+sig%nfreqeval+ifold,ispin) = achDyn(ifold,in,ispin)
            enddo
            do ifold = 1, sig%nfreqeval
              xdum1Dyn(idum+1+2*sig%nfreqeval+ifold,ispin) = achDyn_cor(ifold,in,ispin)
            enddo
            do ifold = 1, sig%nfreqeval
              xdum1Dyn(idum+1+3*sig%nfreqeval+ifold,ispin) = achDyn_corb(ifold,in,ispin)
            enddo
            do ifold = 1, sig%nfreqeval
              xdum1Dyn(idum+1+4*sig%nfreqeval+ifold,ispin) = ach2Dyn(ifold,in,ispin)
            enddo
            xdum1Dyn(idum+5*sig%nfreqeval+2,ispin) = achcor(in,ispin)
          enddo
        enddo
      endif
      
#ifdef MPI
      call logit('Calling MPI_Allreduce xdum1 to xdum2')
#endif
      if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
        SAFE_ALLOCATE(xdum2, ((3+2*nfreqgpp)*(sig%ndiag+sig%noffdiag),sig%nspin))
        ndum=(sig%ndiag+sig%noffdiag)*(3+2*nfreqgpp)*sig%nspin
#ifdef MPI
        call MPI_Allreduce(xdum1(1,1),xdum2(1,1),ndum,MPI_SCALAR,MPI_SUM,MPI_COMM_WORLD,mpierr)
#else
        xdum2=xdum1
#endif
        SAFE_DEALLOCATE(xdum1)
      endif
      if (sig%freq_dep.eq.2) then
        ndum=(sig%ndiag+sig%noffdiag)*nfold*sig%nspin
        SAFE_ALLOCATE(xdum2Dyn, (nfold*(sig%ndiag+sig%noffdiag),sig%nspin))
#ifdef MPI
        call MPI_Allreduce(xdum1Dyn(1,1),xdum2Dyn(1,1),ndum,MPI_COMPLEX_DPC,MPI_SUM,MPI_COMM_WORLD,mpierr)
#else
        xdum2Dyn=xdum1Dyn
#endif
        SAFE_DEALLOCATE(xdum1Dyn)
      endif
      call logit('Done reduction of xdum1 to xdum2')
      
      if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
        do ispin=1,sig%nspin
          do in=1,sig%ndiag+sig%noffdiag
            idum = (in - 1) * (3 + 2*nfreqgpp)
            axbis(in,ispin) = axbis(in,ispin) + xdum2(idum+1,ispin)
            asxbis(:,in,ispin) = asxbis(:,in,ispin) + xdum2(idum+2:idum+1+nfreqgpp,ispin)
            achbis(:,in,ispin) = achbis(:,in,ispin) + xdum2(idum+2+nfreqgpp:idum+1+2*nfreqgpp,ispin)
            achcorbis(in,ispin) = achcorbis(in,ispin) + xdum2(idum+2+2*nfreqgpp,ispin)
            asig_imagbis(in,ispin) = asig_imagbis(in,ispin) + xdum2(idum+3+2*nfreqgpp,ispin)
            if(.not. sig%use_xdat) ax(in,ispin) = ZERO
            asx(:,in,ispin) = ZERO
            ach(:,in,ispin) = ZERO
            achcor(in,ispin) = (0.0d0,0.0d0)
            asig_imag(in,ispin) = (0.0d0,0.0d0)
          enddo
        enddo
        SAFE_DEALLOCATE(xdum2)
      endif
      if (sig%freq_dep.eq.2) then
        do ispin=1,sig%nspin
          do in=1,sig%ndiag+sig%noffdiag
            idum = (in - 1) * nfold
            axbis(in,ispin) = axbis(in,ispin) + xdum2Dyn(idum+1,ispin)
            do ifold = 1, sig%nfreqeval
              asxDbis(ifold,in,ispin) = asxDbis(ifold,in,ispin) + xdum2Dyn(idum+1+ifold,ispin)
            enddo
            do ifold = 1, sig%nfreqeval
              achDbis(ifold,in,ispin) = achDbis(ifold,in,ispin) + xdum2Dyn(idum+1+sig%nfreqeval+ifold,ispin)
            enddo
            do ifold = 1, sig%nfreqeval
              achDbis_cor(ifold,in,ispin) = achDbis_cor(ifold,in,ispin) + xdum2Dyn(idum+1+2*sig%nfreqeval+ifold,ispin)
            enddo
            do ifold = 1, sig%nfreqeval
              achDbis_corb(ifold,in,ispin) = achDbis_corb(ifold,in,ispin) + xdum2Dyn(idum+1+3*sig%nfreqeval+ifold,ispin)
            enddo
            do ifold = 1, sig%nfreqeval
              ach2Dbis(ifold,in,ispin) = ach2Dbis(ifold,in,ispin) + &
                xdum2Dyn(idum+1+4*sig%nfreqeval+ifold,ispin)
            enddo
            achcorbis(in,ispin) = achcorbis(in,ispin) + xdum2Dyn(idum+5*sig%nfreqeval+2,ispin)
            if(.not. sig%use_xdat) ax(in,ispin) = ZERO
            asxDyn(:,in,ispin) = (0.0d0,0.0d0)
            achDyn(:,in,ispin) = (0.0d0,0.0d0)
            achDyn_cor(:,in,ispin) = (0.0d0,0.0d0)
            achDyn_corb(:,in,ispin) = (0.0d0,0.0d0)
            ach2Dyn(:,in,ispin) = (0.0d0,0.0d0)
            achcor(in,ispin) = (0.0d0, 0.0d0)
          enddo
        enddo
        SAFE_DEALLOCATE(xdum2Dyn)
      endif
      
      in = sig%ntband * (sig%ndiag + sig%noffdiag) * sig%nspin
      if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
        SAFE_ALLOCATE(xdum3, (sig%ntband,sig%ndiag+sig%noffdiag,sig%nspin))
        xdum3 = ach_n1q
#ifdef MPI
        call MPI_Allreduce(xdum3(1,1,1),ach_n1q(1,1,1),in,MPI_SCALAR,MPI_SUM,MPI_COMM_WORLD,mpierr)
#endif
        SAFE_DEALLOCATE(xdum3)
        do ispin = 1, sig%nspin
          do in = 1, sig%ndiag + sig%noffdiag
            do ib = 1, sig%ntband
              ach_n1(ib,in,ispin) = ach_n1(ib,in,ispin) + ach_n1q(ib,in,ispin)
              ach_n1q(ib,in,ispin) = 0.0d0
            enddo
          enddo
        enddo
      endif
      if (sig%freq_dep.eq.2) then
        SAFE_ALLOCATE(xdum3Dyn, (sig%ntband,sig%ndiag+sig%noffdiag,sig%nspin))
        xdum3Dyn = achD_n1q
#ifdef MPI
        call MPI_Allreduce(xdum3Dyn(1,1,1),achD_n1q(1,1,1),in,MPI_COMPLEX_DPC,MPI_SUM,MPI_COMM_WORLD,mpierr)
#endif
        SAFE_DEALLOCATE(xdum3Dyn)
        do ispin = 1, sig%nspin
          do in = 1, sig%ndiag + sig%noffdiag
            do ib = 1, sig%ntband
              achD_n1(ib,in,ispin) = achD_n1(ib,in,ispin) + achD_n1q(ib,in,ispin)
              achD_n1q(ib,in,ispin) = 0.0d0
            enddo
          enddo
        enddo
      endif
    enddo ! irq (loop over rq point in BZ summation)
    call progress_free(prog_info)
    
!!-------- End loop over k-points in irr bz with respect to kn (rq) ------------


    if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
      do ispin=1,sig%nspin
        do in=1,sig%ndiag+sig%noffdiag
          if(.not. sig%use_xdat) ax(in,ispin) = axbis(in,ispin)
          asx(:,in,ispin) = asxbis(:,in,ispin)
          ach(:,in,ispin) = achbis(:,in,ispin)
          achcor(in,ispin) = achcorbis(in,ispin)
          asig_imag(in,ispin) = asig_imagbis(in,ispin)
        enddo
      enddo
    endif
    if (sig%freq_dep.eq.2) then
      do ispin=1,sig%nspin
        do in=1,sig%ndiag+sig%noffdiag
          if(.not. sig%use_xdat) ax(in,ispin) = axbis(in,ispin)
          asxDyn(:,in,ispin) = asxDbis(:,in,ispin)
          achDyn(:,in,ispin) = achDbis(:,in,ispin)
          achDyn_cor(:,in,ispin) = achDbis_cor(:,in,ispin)
          achDyn_corb(:,in,ispin) = achDbis_corb(:,in,ispin)
          ach2Dyn(:,in,ispin) = ach2Dbis(:,in,ispin)
          achcor(in,ispin) = achcorbis(in,ispin)
        enddo
      enddo
    endif
    
!----------------------------
! Output unsymmetrized values of X,SX,CH
! Symmetrize X,SX,CH matrix elements over degenerate states
! Convert to eV and output symmetrized values of X,SX,CH

    if (peinf%inode.eq.0) then

      ! This beautiful piece of code should never be executed and here only because it somehow
      ! prevents gfortran from an incorrect optimization of this routine that will produce NaNs.
      ! It has some mysterious relation to the ijk loop above reading from iunit_eps.
      ! Insanely, the presence of function 'isNaN' appears to be crucial here. --DAS
#ifdef GNU
      if(ikn == -1) then
        call die("BUG: ikn = -1")
        write(0,*) isNaN(fact)
      endif
#endif

      do ispin=1,sig%nspin
        write(6,989)ikn,sig%spin_index(ispin)
        if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
          write(6,988)
          do i=1,sig%ndiag
            write(6,987)sig%diag(i),wfnk%elda(sig%diag(i),ispin), &
              wfnk%ek(sig%diag(i),ispin),dble(ax(i,ispin))*ryd, &
              dble(asx(2,i,ispin))*ryd,dble(ach(2,i,ispin)+achcor(i,ispin))*ryd, &
              dble(ax(i,ispin)+asx(2,i,ispin)+ach(2,i,ispin)+achcor(i,ispin))*ryd, &
              dble(alda(i,ispin))*ryd
          enddo
        endif
          
        if (sig%freq_dep.eq.2) then
          write(6,888)
          do i=1,sig%ndiag
            
! JRD: Find iw closest to e_lk

            diffmin = INF
            e_lk = wfnk%ek(sig%diag(i),ispin)
            do iw=1,sig%nfreqeval
              diff = abs(sig%freqevalmin + (iw-1)*sig%freqevalstep - e_lk)
              if (diff .lt. diffmin) then
                diffmin=diff
                iwlda=iw
              endif
            enddo
            
            write(6,887)sig%diag(i),wfnk%elda(sig%diag(i),ispin), &
              wfnk%ek(sig%diag(i),ispin),dble(ax(i,ispin))*ryd, &
              dble(asxDyn(iwlda,i,ispin))*ryd,dble(achDyn(iwlda,i,ispin)+achcor(i,ispin))*ryd, &
              dble(ax(i,ispin)+asxDyn(iwlda,i,ispin)+achDyn(iwlda,i,ispin)+achcor(i,ispin))*ryd, &
              dble(alda(i,ispin))*ryd, &
              IMAG(asxDyn(iwlda,i,ispin))*ryd,IMAG(achDyn(iwlda,i,ispin))*ryd, &
              IMAG(ax(i,ispin)+asxDyn(iwlda,i,ispin)+achDyn(iwlda,i,ispin))*ryd, &
              IMAG(ax(i,ispin)+asxDyn(iwlda,i,ispin)+ach2Dyn(iwlda,i,ispin))*ryd
          enddo
        endif
      enddo
    endif ! node 0
989 format(1x,"Unsymmetrized values for ik =",i4,1x,"spin =",i2)
988 format(/,3x,"n",5x,"elda",5x,"ecor",8x,"x",5x,"sx-x",7x,"ch",6x,"sig",6x,"vxc")
987 format(i4,7f9.3)
888 format(/,3x,"n",4x,"elda",5x,"ecor",8x,"x",2x,"re sx-x",4x,"re ch",3x,"re sig",6x,"vxc",/,&
      33x,"im sx-x",4x,"im ch",3x,"im sig",2x,"im sig2")
887 format(i4,7f9.3,/,28x,4f9.3)
      
!----------------------------
! Symmetrize matrix elements

    if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
      call shiftenergy(sig,wfnk,alda,asx,ach,achcor,ax,efsto,asig,enew,zrenorm,nfreqgpp)
    endif
    if (sig%freq_dep.eq.2) then
      call shiftenergy_dyn(sig,wfnk,alda,asxDyn,achDyn,achDyn_cor,achDyn_corb,ach2Dyn,achcor,ax,efstoDyn,asigDyn,ikn,kp)
    endif
    
!----------------------------
! Write out matrix elements

    if (peinf%inode.eq.0) then
      write(6,'(a)')
      write(6,'(a)') ' Symmetrized values from band-averaging:'

      if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
        call write_result(kp,wfnk,sig,ach_n1,ax,asx,ach,achcor,asig,alda,efsto,enew,zrenorm,ikn)
        call write_result_hp(kp,wfnk,sig,ax,asx,ach,achcor,asig,alda,efsto,enew,zrenorm,ikn)
        if (sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
          do ispin=1,sig%nspin
            do i=1,sig%ndiag
              if (abs(efsto(i,ispin)+SCALARIFY(achcor(i,ispin)) - wfnk%ek(sig%diag(i),ispin)).gt.sig%dw) then
                eqp1flag=.true.
              endif
            enddo
          enddo
        endif
      endif
      
      if (sig%freq_dep.eq.2) then
        call write_result_dyn(kp,wfnk,sig,achD_n1,ax,asxDyn,achDyn,achDyn_corb,achcor,asigDyn,alda,efstoDyn,ikn)
        call write_result_dyn_hp(kp,wfnk,sig,ax,asxDyn,achDyn,achDyn_corb,achcor,asigDyn,alda,efstoDyn,ikn)
      endif
    endif

    if(peinf%inode == 0) then

!----------------------------
! If not using vxc.dat, create and write Vxc in it

      call timacc(5,1)
      if(.not. sig%use_vxcdat) then
        call write_matrix_elements_type(120, kp%rk(:, ikn), sig, COMPLEXIFY(alda(:,:)))
      endif
      call timacc(5,2)
      
!----------------------------
! If not using x.dat, create and write X in it

      call timacc(18,1)
      if (sig%coul_mod_flag .and. (.not. sig%use_vxc2dat)) then
        call write_matrix_elements_type(121, kp%rk(:, ikn), sig, COMPLEXIFY(ax(:,:)))
      else if((.not. sig%use_xdat) .and. xflag .and. (.not. sig%coul_mod_flag)) then
        ax(:,:) = ax(:,:) / sig%xfrac
        call write_matrix_elements_type(119, kp%rk(:, ikn), sig, COMPLEXIFY(ax(:,:)))
      endif
      call timacc(18,2)
      
    endif
    
    SAFE_DEALLOCATE(indrq)
    SAFE_DEALLOCATE(neq)
    SAFE_DEALLOCATE(itnrq)
    SAFE_DEALLOCATE(rq)
    SAFE_DEALLOCATE(kg0)
    SAFE_DEALLOCATE_P(wfnk%zk)
    
  enddo ! ika (loop over k-points sig%nkn)

  call dealloc_grid(gr)
  call destroy_qran() ! from vcoul_generator

!--------- End loop over rkn points (for which we calculate sigma) -------------

  
  if (peinf%inode.eq.0) then
    if (sig%freq_dep.eq.2) then
      write(6,778)
      write(7,778)
      write(8,778)
    endif
    write(6,777)
    write(7,777)
    write(8,777)
    if (sig%noffdiag.gt.0) then
      write(6,776)
      write(7,776)
      write(8,776)
    endif
  endif
778 format(/,1x,"eqp0 eigenvalue at nearest energy to quasiparticle eigenvalue", &
      /,1x,"for eqp0(omega) look at spectrum.dat")
777 format( &
      /,4x,"n = band index", &
      /,1x,"elda = energy eigenvalue", &
      /,1x,"ecor = corrected energy eigenvalue", &
      /,4x,"x = bare exchange", &
      /,3x,"sx = screened exchange at energy ecor", &
      /,3x,"ch = coulomb hole at energy ecor", &
      /,2x,"sig = sx + ch = self-energy at energy ecor", &
      /,2x,"vxc = exchange-correlation potential", &
      /,1x,"eqp0 = elda - vxc + sig(ecor)", &
      /,1x,"eqp1 = eqp0 + (dsig/de) / (1 - dsig/de) * (eqp0 - ecor)", &
      /,2x,"Znk = quasiparticle renormalization factor",/, &
      /,8x,"finite_difference_form from sigma.inp file:", &
      /,8x,"none     = -2 : dsig/de = 0 [skip the expansion]", &
      /,8x,"backward = -1 : dsig/de = (sig(ecor) - sig(ecor-de)) / de", &
      /,8x,"central  =  0 : dsig/de = (sig(ecor+de) - sig(ecor-de)) / (2*de)", &
      /,8x,"forward  =  1 : dsig/de = (sig(ecor+de) - sig(ecor)) / de", &
      /,8x,"default  =  2 : forward for diagonal and none for off-diagonal", &
      /,8x,"de is finite_difference_spacing from sigma.inp file", &
      /,8x,"elda,ecor,x,sx,ch,sig,vxc,eqp0,eqp,de are in eV", &
      /,8x,"elda and vxc both contain vxc0 so it cancels out", &
      /,8x,"eqp1 and eqp0 are Eqs. (36-37) from Hybertsen & Louie PRB 34 5390",/)
776 format( &
      4x,"n = band index of bra wavefunction", &
      /,4x,"m = band index of ket wavefunction", &
      /,4x,"l = band index of energy eigenvalue",/, &
      /,1x,"< psi_n(k) |       x       | psi_m(k) >" &
      /,1x,"< psi_n(k) | sx(ecor_l(k)) | psi_m(k) >" &
      /,1x,"< psi_n(k) | ch(ecor_l(k)) | psi_m(k) >" &
      /,1x,"< psi_n(k) |      vxc      | psi_m(k) >",/)
  
  if (peinf%inode.eq.0) then
    if (eqp1flag) then
      write(0,666)
    endif
  endif
666 format( &
      1x,"WARNING: |eqp0 - ecor| > finite_difference_spacing",/, &
      3x,"Linear extrapolation for eqp1 may be inaccurate.",/, &
      3x,"You should test the validity of eqp1 by rerunning",/, &
      3x,"calculation with self-energy evaluated at the eqp0",/, &
      3x,"energies. For that, use the eqp_outer.dat file",/, &
      3x,"created with eqp.py script and point WFN_outer to WFN_inner",/, &
      3x,"if you were not already using WFN_outer.",/)
  
  if (peinf%inode.eq.0) then
    if (imagvxcflag) write(0,677) 'Vxc'
    if (imagxflag) write(0,677) 'exchange'
  endif
677 format( &
      1x,"WARNING: ",a," diagonal matrix elements have large imaginary part.",/, &
      3x,"This may indicate a problem with your calculation.",/)
  
    
!--------- Clean Up and Finish -------------------------------------------------

  call destroy_fftw_plans()

  if (sig%spl_tck%n>0) then
    SAFE_DEALLOCATE_P(sig%spl_tck%t)
    SAFE_DEALLOCATE_P(sig%spl_tck%c)
  endif
  if (sig%spl_tck_outer%n>0) then
    SAFE_DEALLOCATE_P(sig%spl_tck_outer%t)
    SAFE_DEALLOCATE_P(sig%spl_tck_outer%c)
  endif

  SAFE_DEALLOCATE_P(kp%rk)
  SAFE_DEALLOCATE_P(wfnk%isrtk)
  SAFE_DEALLOCATE_P(wfnk%ek)
  SAFE_DEALLOCATE_P(wfnk%elda)
  SAFE_DEALLOCATE_P(sig%qpt)
  if (sig%ndiag.ne.0) then
    SAFE_DEALLOCATE_P(sig%diag)
  end if
  if (sig%noffdiag.gt.0) then
    SAFE_DEALLOCATE_P(sig%off1)
    SAFE_DEALLOCATE_P(sig%off2)
    SAFE_DEALLOCATE_P(sig%off3)
    SAFE_DEALLOCATE_P(sig%offmap)
  endif
  SAFE_DEALLOCATE_P(sig%indkn)
  if(.not. sig%use_vxcdat) then
    SAFE_DEALLOCATE_P(sig%vxc)
  end if
  SAFE_DEALLOCATE_P(gvec%components)
  SAFE_DEALLOCATE_P(gvec%index_vec)
  if(sig%freq_dep.eq.1) then
    SAFE_DEALLOCATE_P(wpg%rho)
  endif
  SAFE_DEALLOCATE_P(wfnkq%isrtkq)
  SAFE_DEALLOCATE_P(wfnkq%ekq)
  SAFE_DEALLOCATE_P(peinf%indext)
  SAFE_DEALLOCATE_P(peinf%indext_dist)
  SAFE_DEALLOCATE_P(peinf%ntband_dist)
  SAFE_DEALLOCATE(alda)
  SAFE_DEALLOCATE(ax)
  SAFE_DEALLOCATE(axbis)
  SAFE_DEALLOCATE(achcor)
  SAFE_DEALLOCATE(achcorbis)
  SAFE_DEALLOCATE(asig_imag)
  if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
    SAFE_DEALLOCATE(asx)
    SAFE_DEALLOCATE(ach)
    SAFE_DEALLOCATE(asig)
    SAFE_DEALLOCATE_P(acht_n1)
    SAFE_DEALLOCATE(ach_n1q)
    SAFE_DEALLOCATE(ach_n1)
    SAFE_DEALLOCATE(asxbis)
    SAFE_DEALLOCATE(achbis)
    SAFE_DEALLOCATE(enew)
    SAFE_DEALLOCATE(efsto)
  endif
  if (sig%freq_dep.eq.2) then
    SAFE_DEALLOCATE(asxDyn)
    SAFE_DEALLOCATE(achDyn)
    SAFE_DEALLOCATE(achDyn_cor)
    SAFE_DEALLOCATE(achDyn_corb)
    SAFE_DEALLOCATE(ach2Dyn)
    SAFE_DEALLOCATE(asigDyn)
    SAFE_DEALLOCATE_P(achtD_n1)
    SAFE_DEALLOCATE(achD_n1q)
    SAFE_DEALLOCATE(achD_n1)
    SAFE_DEALLOCATE(asxDbis)
    SAFE_DEALLOCATE(achDbis)
    SAFE_DEALLOCATE(achDbis_cor)
    SAFE_DEALLOCATE(achDbis_corb)
    SAFE_DEALLOCATE(ach2Dbis)
    SAFE_DEALLOCATE(efstoDyn)
    SAFE_DEALLOCATE_P(asxtDyn)
    SAFE_DEALLOCATE_P(achtDyn)
    SAFE_DEALLOCATE_P(achtDyn_cor)
    SAFE_DEALLOCATE_P(achtDyn_corb)
    SAFE_DEALLOCATE_P(ach2tDyn)
  endif
  if (sig%freq_dep.eq.1.or.sig%freq_dep.eq.3) then
    SAFE_DEALLOCATE_P(acht)
    SAFE_DEALLOCATE_P(asxt)
  endif
  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
    SAFE_DEALLOCATE_P(eps)
    if (sig%iwriteint.eq.0) then
      SAFE_DEALLOCATE(epstemp)
    end if
  endif
  if (sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
    SAFE_DEALLOCATE_P(epsR)
    SAFE_DEALLOCATE(epsRtemp)
#ifdef CPLX
    SAFE_DEALLOCATE_P(epsA)
    SAFE_DEALLOCATE(epsAtemp)
#endif
  endif
  SAFE_DEALLOCATE(isrtrq)
  if (sig%freq_dep.eq.0.or.sig%exact_ch.eq.1) then
    SAFE_DEALLOCATE_P(isrtrqi)
  endif
  SAFE_DEALLOCATE(ekin)
  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
    SAFE_DEALLOCATE(isrtq)
    SAFE_DEALLOCATE(isrtqi)
    SAFE_DEALLOCATE(ind)
    SAFE_DEALLOCATE(indinv)
    SAFE_DEALLOCATE(ph)
    if(sig%iwriteint.eq.1) then
      if (peinf%inode .eq. 0) then
        SAFE_DEALLOCATE_P(epsmpi%isrtq)
        SAFE_DEALLOCATE_P(epsmpi%isrtqi)
      endif
      SAFE_DEALLOCATE_P(epsmpi%qk)
      SAFE_DEALLOCATE_P(epsmpi%nmtx)
      SAFE_DEALLOCATE_P(epsmpi%igp_owner)
      SAFE_DEALLOCATE_P(epsmpi%igp_index)
      SAFE_DEALLOCATE_P(epsmpi%inv_igp_index)
      if(sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
        SAFE_DEALLOCATE_P(epsmpi%eps)
      else
        SAFE_DEALLOCATE_P(epsmpi%epsR)
#ifdef CPLX
        SAFE_DEALLOCATE_P(epsmpi%epsA)
#endif
      endif
    endif
  endif
  if (sig%iwriteint.eq.1) then
    SAFE_DEALLOCATE_P(wfnkqmpi%nkptotal)
    SAFE_DEALLOCATE_P(wfnkqmpi%isort)
    SAFE_DEALLOCATE_P(wfnkqmpi%el)
    SAFE_DEALLOCATE_P(wfnkqmpi%qk)
    SAFE_DEALLOCATE_P(wfnkqmpi%band_index)
    SAFE_DEALLOCATE_P(wfnkqmpi%cg)
    if(sig%nkn.gt.1) then
      SAFE_DEALLOCATE_P(wfnkmpi%nkptotal)
      SAFE_DEALLOCATE_P(wfnkmpi%isort)
      SAFE_DEALLOCATE_P(wfnkmpi%qk)
      SAFE_DEALLOCATE_P(wfnkmpi%el)
      SAFE_DEALLOCATE_P(wfnkmpi%elda)
      SAFE_DEALLOCATE_P(wfnkmpi%cg)
    endif
  endif
  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2.or.sig%freq_dep.eq.3) then
    SAFE_DEALLOCATE(igp_owner)
    SAFE_DEALLOCATE(igp_index)
    SAFE_DEALLOCATE(inv_igp_index)
  endif
    
!----------------------------
! Time Accounting
    
  routnam(1)='TOTAL:'
  routnam(2)='INPUT:'
  routnam(3)='EPSCOPY:'
  routnam(4)='FULLBZ:'
  routnam(5)='VXC:'
  routnam(6)='SUBGRP:'
  routnam(7)='IRRBZ:'
  routnam(8)='GMAP:'
  routnam(9)='GENWF:'
  routnam(10)='MTXEL:'
  routnam(11)='SXCH TOT.:'
!  routnam(12)='SX CH (WPEFF):'
  routnam(13)='VCOUL:'
  routnam(14)='EPSREAD:'
  routnam(15)='INPUT_OUTER:'
  routnam(16)='MTXEL_CH:'
  routnam(17)='MTXEL COMM:'
  routnam(18)='BARE X:'
  routnam(19)='WF COMM:'
  routnam(20)='WF_CH COMM:'
  routnam(21)='INPUT (READ):'
  routnam(22)='INPUT (WRITE):'
  routnam(41)='SXCH INIT:'
  routnam(42)='SXCH EPS INIT:'
  routnam(43)='SXCH COMM:'
  routnam(44)='SXCH EX. STAT:'
  routnam(45)='SXCH WPLASMA:'
  routnam(46)='SXCH PP PARS:'
  routnam(49)='SXCH FF CH:'
  routnam(50)='SXCH FF CHX:'
  routnam(51)='SXCH REMAIN:'
  routnam(52)='SXCH PP SUM:'
  routnam(53)='SXCH FF SX:'
  routnam(59)='READ NEPS:'
  routnam(61)='EPSCOPY IO:'
  routnam(62)='EPSCOPY COMM:'
  routnam(81)='INPUT I/O'
  routnam(82)='INPUT COMM'
  routnam(91)='FFT ZERO'
  routnam(92)='FFT PUT'
  routnam(93)='FFT PLAN'
  routnam(94)='FFT EXEC'
  routnam(95)='FFT MLTPLY'
  
  routsrt=(/ 1, 4, 7, 6, 8, 2,81,82,21,22,15, 9, 19,20,3,61,62,59,14,10,16, &
             17,13,5,18,11,41,42,43,44,51,45,46,52,53,49,50,91,92,93,94,95/)
  
  if(peinf%inode.eq.0) then
    call timacc(1,2)
    write(6,9000)
    do i=2,ubound(routsrt, 1)
      call timacc(routsrt(i),3,tsec,ncount)
      write(6,9001) routnam(routsrt(i)),tsec(1),tsec(2),ncount
    enddo
    write(6,*)
    call timacc(routsrt(1),3,tsec,ncount)
    write(6,9002) routnam(routsrt(1)),tsec(1),tsec(2)
    write(6,*)
9000 format(/,22x,"CPU [s]",8x,"WALL [s]",11x,"#",/)
9001 format(a16,f13.3,3x,f13.3,3x,i9)
9002 format(a16,f13.3,3x,f13.3)
  endif
  
!----------------------------
! Close files and finish

  call close_file(55) ! file sigma.inp
  if(peinf%inode == 0) then
    call close_file(7) ! file sigma.log
    call close_file(8) ! file sigma_hp.log
    if (sig%coul_mod_flag .and. (.not. sig%use_vxc2dat)) then
      call close_file(121) ! file vxc2.dat
    elseif ((.not. sig%use_xdat) .and. xflag .and. (.not. sig%coul_mod_flag)) then
      call close_file(119) ! file x.dat
    endif
    if(.not. sig%use_vxcdat) then
      call close_file(120) ! file vxc.dat
    end if
    if (.not.(sig%freq_dep .eq. 0 .and. sig%exact_ch .eq. 1)  .and. .not. (sig%freq_dep == -1)) then
      call close_file(127) ! file ch_converge.dat
    endif
    if (sig%iwritecoul .eq. 1) then
      call close_file(19) ! file vcoul
    endif
  endif  
  if (peinf%inode.eq.0 .and. (sig%freq_dep.eq.2 .or. (sig%fdf.eq.-3 .and. sig%freq_dep.eq.1))) then
    write(8000, '(/,a)')'# Please refer to Sigma/README for more information about this file.'
    call close_file(8000) ! file spectrum.dat
  endif
  if (sig%iwriteint .eq. 0) then
    if (peinf%inode.eq.0 .and. sig%freq_dep >= 0) then
      call open_file(iunit_eps, file=fne, status='old')
      call close_file(iunit_eps, delete = .true.)
    endif
    call open_file(iunit_c, file=fnc, status='old')
    call close_file(iunit_c, delete = .true.) ! files INT_CWFN_*
    if (mod(peinf%inode,peinf%npes/peinf%npools).eq.0) then
      call open_file(iunit_k, file=fnk, status='old')
      call close_file(iunit_k, delete = .true.) ! files INT_WFNK_*
    endif
  endif
  
  call write_memory_usage()

#ifdef HDF5

  if(sig%use_hdf5) call h5close_f(error)

#endif

#ifdef MPI
  call MPI_Finalize(mpierr)
#endif

end program sigma
