!===============================================================================
!
! Routines:
!
! (1) sigma              Originally By MSH        Last Modified 10/5/2009 (gsm)
!
! This is the main routine for the Sigma code.  Please see the documentation
! in the README for more information on this code.
!
!===============================================================================

#include "f_defs.h"

program sigma

  use global_m
  use fixwings_m
  use fftw_m
  use fullbz_m
  use gmap_m
  use irrbz_m
  use misc_m
  use mtxel_sxch_m
  use sort_m
  use vcoul_generator_m
  use wfn_rho_vxc_io_m
  implicit none

!---------------------
! Derived Types

  type (crystal) :: crys
  type (symmetry) :: syms
  type (gspace) :: gvec
  type (kpoints) :: kp
  type (siginfo) :: sig
  type (wpgen) :: wpg
  type (wfnkstates) :: wfnk,wfnkoff
  type (wfnkqstates) :: wfnkq
  type (epsmpiinfo) :: epsmpi
  type (wfnkqmpiinfo) :: wfnkqmpi
  type (wfnkmpiinfo) :: wfnkmpi
  type (twork_scell) :: work_scell

!--------------------
! Variables for k-points

!--------------------
! k-points for which we know epsilon (q-points)

  integer :: nq, iparallel
  real(DP), allocatable :: qq(:,:)

!---------------------
! k-points for the sum over BZ (the rq-points)

  integer :: nm,nrq,ijk,iout
  integer, allocatable :: neq(:),indrq(:),itnrq(:),kg0(:,:)
  real(DP), allocatable :: rq(:,:)
  type(grid) :: gr
  
!---------------------
! (k-q) kpoints involved in sigma summations
! ===> see data type cwfnkq and vwfnkq

  real(DP) :: rkq(3)

!---------------------
! Dielectric matrices

  integer :: ngq,neps,nmtx,ncoul,ncoulch,ncoulb,ncouls
  integer :: ngqt,nmtxt
  integer, allocatable :: isrtq(:),isrtqi(:)
  integer, allocatable :: isrtrq(:),isrtrqi(:)
  SCALAR, allocatable :: eps(:,:)
  real(DP), allocatable :: vcoul(:)
  complex(DPC), allocatable :: epsR(:,:,:),epsA(:,:,:)

!---------------------
! Matrix elements for sigma

  SCALAR, allocatable :: aqs(:,:),aqsaug(:,:,:,:),aqsch(:), &
    aqsaugchd(:,:,:),aqsaugcho(:,:,:),alda(:,:),ax(:,:),axbis(:,:), &
    asx(:,:,:),ach(:,:,:),asig(:,:),acht_n1(:), &
    ach_n1q(:,:,:),ach_n1(:,:,:),asxbis(:,:,:),achbis(:,:,:), &
    xdum1(:,:),xdum2(:,:),xdum3(:,:,:)
  real(DP), allocatable :: enew(:,:),efsto(:,:),zrenorm(:,:)

  complex(DPC), allocatable :: achcor(:,:), achcorbis(:,:)
  complex(DPC), allocatable :: asig_imag(:,:), asig_imagbis(:,:)

  complex(DPC), allocatable :: asxDyn(:,:,:),achDyn(:,:,:), &
    ach2Dyn(:,:,:),asigDyn(:,:),achtD_n1(:),achD_n1q(:,:,:), &
    achD_n1(:,:,:),asxDbis(:,:,:),achDbis(:,:,:),ach2Dbis(:,:,:), &
    xdum1Dyn(:,:),xdum2Dyn(:,:),xdum3Dyn(:,:,:)
  complex(DPC), allocatable :: efstoDyn(:,:)

!----------------------
! eps distrib variables

  integer :: ngown
  integer, allocatable :: igown(:),iggown(:)
  SCALAR, allocatable :: epstemp(:)
  
  complex(DPC), allocatable :: epsRtemp(:,:),epsAtemp(:,:)

  character :: tmpstr*100
  character :: tmpfn*16
  character*20 :: fnc,fnk,fne
  character*16 :: routnam(22)
  integer :: routsrt(21)
  logical :: xflag,eqp1flag,imagvxcflag,imagxflag,found,q0flag,bExactlyZero
  integer :: ig,igad,i,j,k,itran,ikn,ika,ioff
  integer :: in,im,iw,ib,idum,kg(3),icurr,jj,ii,ispin,g1,g2
  integer :: ncount,ndum,nbandi,tag,dest,source,nfold,ifold
  integer :: iwlda,irq,n1,ierr
  integer :: s2,itpc,itpk,itpe,ndv_ikn
  integer, allocatable :: ind(:)
  real(DP) :: fact,coulfact,tempval,occ,edummy
  real(DP) :: qshift(3),oneoverq,qlen,q0len,vq(3),qk(3)
  real(DP) :: tsec(2),diffmin,diff,e_lk
  complex(DPC), allocatable :: achtDyn(:),asxtDyn(:),ach2tDyn(:)
  SCALAR :: asxt(3),acht(3),achtcor,axt,ax_ig,epshead,asigt_imag
  SCALAR, allocatable :: ph(:)

  logical :: skip_checkbz

!--------------- Begin Program -------------------------------------------------

  call peinfo_init()

!----------------------
! Initialize random numbers

  peinf%jobtypeeval = 1

!----------------------
! Initialize wcoul0
  sig%wcoul0 = ZERO

!------------------------
! Initialize timer

  if (peinf%inode.eq.0) then
    call timacc(1,0,tsec)
    call timacc(1,1,tsec)
  endif

!------------------------
! Initialize files

  call open_file(55,file='sigma.inp',form='formatted',status='old')
!      call open_file(6,file='sig.out',form='formatted')
  if(peinf%inode == 0) then
    call open_file(7,file='sigma.log',form='formatted',status='replace')
    call open_file(8,file='sigma_hp.log',form='formatted',status='replace')
  endif

  call write_program_header('Sigma', .true.)

!------------------------
! Initialize temporary files

!  if (sig%iwriteint .eq. 0) then
! We do not yet know, until inread inside input, whether we will use comm_disk
! However, we need to pass fnc and fnk to input. Therefore, we initialize these
! variables in all cases, though we will never use them for comm_mpi. --DAS
  itpc=10000+peinf%inode
  itpk=20000+peinf%inode
  itpe=30000

  if(peinf%inode.lt.10000) then
    write(fnc,'(a,i4.4)') 'INT_CWFN_', peinf%inode
    write(fnk,'(a,i4.4)') 'INT_WFNK_', peinf%inode
    write(fne,'(a)') 'INT_EPS'
  else
    call die('temp file names: too many nodes')
  endif

!------- Read crys data and wavefunctions from WFN_inner ----------------------------

! JRD: Included in input is the inread routine which reads the
! job parameters from sigma.inp and initializes the XC potential

  if (peinf%inode.eq.0) call timacc(2,1,tsec)
  call input(crys,gvec,syms,kp,wpg,sig,wfnk,itpc,itpk,fnc,fnk,wfnkqmpi,wfnkmpi)
  if (peinf%inode.eq.0) call timacc(2,2,tsec)
  if (peinf%inode.eq.0) call timacc(15,1,tsec)
  call input_outer(crys,gvec,syms,kp,sig,wfnk,itpk,fnk,wfnkmpi)
  if (peinf%inode.eq.0) call timacc(15,2,tsec)
  SAFE_DEALLOCATE_P(sig%band_index)
  SAFE_DEALLOCATE_P(sig%kpt)
  SAFE_DEALLOCATE_P(kp%ifmin)
  SAFE_DEALLOCATE_P(kp%ifmax)

!-------------------
! Initialize Various Parameters from inread

! linear extrapolation for eqp1

  eqp1flag=.false.

! imaginary parts of diagonal vxc or exchange matrix elements

  imagvxcflag=.false.
  imagxflag=.false.

! fraction of bare exchange

  if (abs(sig%xfrac).GT.TOL_Small) then
    xflag=.true.
  else
    xflag=.false.
  endif

!---------------------
! Open Various Files 

  if (peinf%inode .eq. 0 .and. .not.(sig%freq_dep .eq. 0 .and. sig%exact_ch .eq. 1) .and. .not. (sig%freq_dep == -1)) then
    call open_file(127,file='ch_converge.dat',form='formatted',status='replace')
  endif

  if (sig%iwritecoul .eq. 1) then
    if (peinf%inode .eq. 0) then
      call open_file(19,file='vcoul',form='formatted',status='replace')
    endif 
  endif

  if ((.not. sig%use_xdat) .and. xflag) then
    call open_file(119,file='x.dat',form='formatted',status='replace')
  endif

  if (.not. sig%use_vxcdat) then
    call open_file(120,file='vxc.dat',form='formatted',status='replace')
  endif

!---------------------
! Write header of sigma_hp.log file

  if (peinf%inode.eq.0) then
    write(8,601) sig%freq_dep
    write(8,602) sig%bmin,sig%bmax
    write(8,603) sig%loff,sig%toff
    write(8,604) sig%fdf
    if(sig%fdf /= -2) write(8,605) sig%dw
    write(8,606) syms%ntran
    do itran=1,syms%ntran
      write(8,607) itran, ((syms%mtrx(i,j,itran),i=1,3),j=1,3)
    enddo
    write(8,*)
  endif
601 format(/,1x,"frequency_dependence",i4)
602 format(/,1x,"band_index",2i6)
603 format(1x,"sigma_matrix",i6,i4)
604 format(/,1x,"finite_difference_form",i4)
605 format(1x,"finite_difference_spacing",f10.6)
606 format(/,1x,"symmetries",/,1x,"ntran  =",i3)
607 format(1x,"mtrx",i2.2,1x,"=",9i3)

!---------------------
! JRD: Initialize the Full Frequency output files

  if (peinf%inode .eq. 0 .and. sig%freq_dep .eq. 2) then
    call open_file(8000,file='spectrum.dat',form='formatted',status='replace')
!        call open_file(8001,file='spectrum.plot',form='formatted')
  endif

!---------------------
! Determine nq and neps
  
  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2) then
    call open_file(unit=10,file='eps0mat',form='unformatted',status='old')
    call open_file(unit=11,file='epsmat',form='unformatted',status='old',iostat=sig%igamma)
    if(peinf%inode.eq.0) then
      read(10)
      read(10)
      read(10)
      read(10)
      read(10)
      read(10)
      read(10)
      read(10)
      read(10)
      read(10) i,neps
      call close_file(10)
      
      call open_file(unit=10,file='eps0mat',form='unformatted',status='old')
      if(sig%igamma.ne.0) then ! Gamma-only calculation
        nq=0
      else
        read(11)
        read(11)
        read(11)
        read(11)
        read(11)
        read(11)
        read(11)
        read(11) nq
        read(11)
        
        do j=1,nq
          read(11) i,nmtx
          read(11)
          if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
            do ijk=1,nmtx
              read(11)
            enddo
          endif
          if (sig%freq_dep.eq.2) then
            do ijk=1,nmtx
              do ii=1,nmtx
                read(11) ! For epsRDyn
              enddo
#ifdef CPLX
              do ii=1,nmtx
                read(11) ! For epsADyn
              enddo
#endif
            enddo
          endif
          read(11)
          if(neps.lt.nmtx) neps=nmtx
        enddo
        call close_file(11)
        call open_file(unit=11,file='epsmat',form='unformatted',status='old',iostat=sig%igamma)
      endif
    endif
#ifdef MPI
    call MPI_Bcast(nq,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
    call MPI_Bcast(neps,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
#endif
  endif
  if (sig%freq_dep.eq.-1) then
    nq=sig%nq-1
    neps=0
  endif

!----------------------------
! Allocate arrays

  SAFE_ALLOCATE(wfnkq%isrtkq, (gvec%ng))
  SAFE_ALLOCATE(wfnkq%ekin, (gvec%ng))
  SAFE_ALLOCATE(wfnkq%ekq, (sig%ntband,kp%nspin))
  SAFE_ALLOCATE(alda, (sig%ndiag+sig%noffdiag,sig%nspin))
  SAFE_ALLOCATE(ax, (sig%ndiag+sig%noffdiag,sig%nspin))
  SAFE_ALLOCATE(axbis, (sig%ndiag+sig%noffdiag,sig%nspin))
  SAFE_ALLOCATE(achcor, (sig%ndiag+sig%noffdiag,sig%nspin))
  SAFE_ALLOCATE(asig_imag, (sig%ndiag+sig%noffdiag,sig%nspin))
  SAFE_ALLOCATE(achcorbis, (sig%ndiag+sig%noffdiag,sig%nspin))
  SAFE_ALLOCATE(asig_imagbis, (sig%ndiag+sig%noffdiag,sig%nspin))
  if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
    SAFE_ALLOCATE(asx, (3,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(ach, (3,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(asig, (sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(acht_n1, (sig%ntband))
    SAFE_ALLOCATE(ach_n1q, (sig%ntband,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(ach_n1, (sig%ntband,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(asxbis, (3,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achbis, (3,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(enew, (sig%ndiag,sig%nspin))
    SAFE_ALLOCATE(efsto, (sig%ndiag,sig%nspin))
    SAFE_ALLOCATE(zrenorm, (sig%ndiag,sig%nspin))
  endif
  if (sig%freq_dep.eq.2) then
    SAFE_ALLOCATE(asxDyn, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achDyn, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(ach2Dyn, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(asigDyn, (sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achtD_n1, (sig%ntband))
    SAFE_ALLOCATE(achD_n1q, (sig%ntband,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achD_n1, (sig%ntband,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(asxDbis, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(achDbis, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(ach2Dbis, (sig%nfreqeval,sig%ndiag+sig%noffdiag,sig%nspin))
    SAFE_ALLOCATE(efstoDyn, (sig%ndiag,sig%nspin))
    SAFE_ALLOCATE(asxtDyn, (sig%nfreqeval))
    SAFE_ALLOCATE(achtDyn, (sig%nfreqeval))
    SAFE_ALLOCATE(ach2tDyn, (sig%nfreqeval))
  endif
  SAFE_ALLOCATE(isrtrq, (gvec%ng))
  if (sig%freq_dep.eq.0.or.sig%exact_ch.eq.1) then
    SAFE_ALLOCATE(isrtrqi, (gvec%ng))
  endif
  SAFE_ALLOCATE(gvec%ekin, (gvec%ng))
  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2) then
    SAFE_ALLOCATE(isrtq, (gvec%ng))
    SAFE_ALLOCATE(isrtqi, (gvec%ng))
    SAFE_ALLOCATE(ind, (gvec%ng))
    SAFE_ALLOCATE(ph, (gvec%ng))
    if(sig%iwriteint.eq.1) then
      if (peinf%inode .eq. 0) then
        SAFE_ALLOCATE(epsmpi%isrtq, (gvec%ng,nq+1))
        SAFE_ALLOCATE(epsmpi%isrtqi, (gvec%ng,nq+1))
      endif
      SAFE_ALLOCATE(epsmpi%qk, (3,nq+1))
      SAFE_ALLOCATE(epsmpi%nmtx, (nq+1))
      SAFE_ALLOCATE(epsmpi%igown, (neps))
      SAFE_ALLOCATE(epsmpi%iggown, (neps))
    endif
  endif
  SAFE_ALLOCATE(qq, (3,nq+1))
 
!----------------------------
! Read eps^-1 from eps0mat/epsmat
!
! JRD: The matrices are read in from eps0mat/epsmat files and writen
! to temporary INT_EPS files on unit itpe. The q->0 matrix is not
! symmetrized. The wavevector q is also read from subroutine epscopy.

  if (peinf%inode.eq.0) call timacc(3,1,tsec)
  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2) then
    call logit('Calling epscopy')
    if (sig%iwriteint.eq.0) then
      
      call epscopy_disk(crys,gvec,nq,qq,sig,neps,itpe,fne,epshead)
      
    else if (sig%iwriteint.eq.1) then
      
      icurr=0
      epsmpi%igown=0
      epsmpi%iggown=0
      
      do ijk = 1,neps
        epsmpi%igown(ijk)=mod(ijk-1,peinf%npes)
        if (peinf%inode .eq. epsmpi%igown(ijk)) then
          icurr=icurr+1
          epsmpi%iggown(ijk)=icurr
        endif
      enddo
      
      epsmpi%ngown=icurr
      
      call epscopy_mpi(crys,gvec,nq,qq,sig,neps,epsmpi,epshead)

    endif  ! if (sig%iwriteint.eq.1)
  endif  ! if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2)
  if (sig%freq_dep.eq.-1) then
    nq=nq+1
    qq(:,:)=sig%qpt(:,:)
  endif
  if (peinf%inode.eq.0) call timacc(3,2,tsec)

!----------------------------
! Generate full Brillouin zone from irreducible wedge q -> gr%f

#ifdef VERBOSE
  if (peinf%inode.eq.0) then
    write(6,*)'*** VERBOSE: nq, syms%ntran= ',nq,syms%ntran
  endif
#endif
  
  if (peinf%inode.eq.0) call timacc(4,1,tsec)
  gr%nr = nq
  SAFE_ALLOCATE(gr%r, (3, nq))
  gr%r(1:3,1:nq) = qq(1:3,1:nq)
  call fullbz(crys,syms,gr,syms%ntran,skip_checkbz,wigner_seitz=.false.,paranoid=.true.)
  qshift(:)=0.0d0
  if (sig%freq_dep.eq.-1) then
    ! for Hartree-Fock, there is no epsmat/eps0mat file
    tmpfn="sigma.inp"
  else
    if (sig%igamma.ne.0) then
      tmpfn='eps0mat'
    else
      tmpfn='epsmat'
    endif
  endif
  if (.not. skip_checkbz) then
    call checkbz(gr%nf,gr%f,sig%qgrid,qshift,crys%bdot,tmpfn,'q',.false.,sig%freplacebz,sig%fwritebz)
  endif
  if (peinf%inode.eq.0) call timacc(4,2,tsec)
  

!-------- Start computation of sigma operator ----------------------------------

  
  fact = 1D0/(dble(gr%nf)*crys%celvol)
  coulfact = 8D0*PI_D/(dble(gr%nf)*crys%celvol)

!----------------------------
! Initialize distribution of epsilon

  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2) then
    
#ifdef VERBOSE
    call logit('Allocating eps')
    if (peinf%inode.eq.0) then
      write(6,*) '*** VERBOSE: size of eps, neps=',neps
    endif
#endif

    SAFE_ALLOCATE(igown, (neps))
    SAFE_ALLOCATE(iggown, (neps))
    
    icurr=0
    igown=0
    iggown=0
    
    do ijk = 1,neps
      igown(ijk)=mod(ijk-1,peinf%npes)
      if (peinf%inode .eq. igown(ijk)) then
        icurr=icurr+1
        iggown(ijk)=icurr
      endif
    enddo

    ngown=icurr

  endif

  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
    SAFE_ALLOCATE(eps, (neps,ngown))
    SAFE_ALLOCATE(epstemp, (neps))
  endif
  if (sig%freq_dep.eq.2) then
    SAFE_ALLOCATE(epsR, (sig%nFreq,neps,ngown))
    SAFE_ALLOCATE(epsRtemp, (sig%nFreq,neps))
#ifdef CPLX
    SAFE_ALLOCATE(epsA, (sig%nFreq,neps,ngown))
    SAFE_ALLOCATE(epsAtemp, (sig%nFreq,neps))
#endif
  endif


!---------- Check grid for uniformity

  if(peinf%inode == 0) call checkgriduniformity(sig%qgrid, crys, sig%icutv)

!-------- Loop over kpoints rkn-------------------------------------------------


  do ika=1,sig%nkn

!----------------------------
! Read wavefunctions for rkn (if sig%nkn.gt.1)
! (else it is already in wfnk)

    ikn = sig%indkn(ika)
    
    if (peinf%inode.eq.0) call timacc(19,1,tsec)
    
    if(sig%nkn.gt.1) then
      
      if (sig%iwriteint .eq. 0) then
        
        if (mod(peinf%inode,peinf%npes/peinf%npools).eq.0) then
          call open_file(itpk,file=fnk,form='unformatted',status='old')
          
          found = .false.
          do im=1,sig%nkn
            read(itpk) wfnk%nkpt,wfnk%ndv,nbandi,(wfnk%isrtk(j),j=1,gvec%ng),(qk(i),i=1,3)
            
! Check k-point

            if(all(abs(kp%rk(1:3,ikn)-qk(1:3)) .lt. TOL_Small)) then
              found = .true.
              in=im
              ndv_ikn=wfnk%ndv
            endif
          enddo

          if(.not. found) then
            call die('sigma main: k-point mismatch')
          endif
          
          ! why are we reading this file a second time?? --DAS
          SAFE_ALLOCATE(wfnk%zk, (ndv_ikn,sig%nspin))
          call close_file(itpk)
          call open_file(itpk,file=fnk,form='unformatted',status='old')
          do im=1,in-1
            read(itpk)
          enddo
          
          read(itpk) wfnk%nkpt,wfnk%ndv,nbandi,(wfnk%isrtk(j),j=1,gvec%ng),(qk(i),i=1,3), &
            ((wfnk%ek(j,k),j=1,nbandi),k=1,sig%nspin), &
            ((wfnk%elda(j,k),j=1,nbandi),k=1,sig%nspin), &
            ((wfnk%zk(j,k),j=1,ndv_ikn),k=1,sig%nspin)
          call close_file(itpk)
        endif
        
      else ! sig%iwriteint .eq. 0
        
        nbandi=sig%ntband
        wfnk%nkpt=wfnkmpi%nkptotal(ika)
        wfnk%ndv=peinf%ndiag_max*wfnk%nkpt
        wfnk%isrtk(1:wfnk%nkpt)=wfnkmpi%isort(1:wfnk%nkpt,ika)
        qk(1:3)=wfnkmpi%qk(1:3,ika)
        wfnk%ek(1:sig%ntband,1:sig%nspin) = wfnkmpi%el(1:sig%ntband,1:sig%nspin,ika)
        wfnk%elda(1:sig%ntband,1:sig%nspin) = wfnkmpi%elda(1:sig%ntband,1:sig%nspin,ika)
        SAFE_ALLOCATE(wfnk%zk, (wfnk%ndv,sig%nspin))
        wfnk%zk=ZERO
        do k=1,sig%nspin
#ifdef MPI
          tag=1024
          if (mod(wfnk%ndv,peinf%npes/peinf%npools).eq.0) then
            jj=wfnk%ndv/(peinf%npes/peinf%npools)
          else
            jj=wfnk%ndv/(peinf%npes/peinf%npools)+1
          endif
          do j=1,peinf%npools
            dest=(j-1)*(peinf%npes/peinf%npools)
            if (peinf%inode.eq.dest) then
              g1=1
              g2=min(jj,wfnk%ndv)
              if (g2.ge.g1) then
                wfnk%zk(g1:g2,k)=wfnkmpi%cg(g1:g2,k,ika)
              endif ! g2.ge.g1
            endif
            do i=2,peinf%npes/peinf%npools
              source=(i-1)+dest
              g1=1+(i-1)*jj
              g2=min(jj+(i-1)*jj,wfnk%ndv)
              if (g2.ge.g1) then
                if (peinf%inode.eq.source) &
                  call MPI_Send(wfnkmpi%cg(1,k,ika),g2-g1+1,MPI_SCALAR,dest,tag,MPI_COMM_WORLD,mpierr)
                if (peinf%inode.eq.dest) &
                  call MPI_Recv(wfnk%zk(g1,k),g2-g1+1,MPI_SCALAR,source,tag,MPI_COMM_WORLD,mpistatus,mpierr)
              endif ! g2.ge.g1
            enddo ! i=2,peinf%npes/peinf%npools
          enddo ! j=1,peinf%npools
#else
          wfnk%zk(1:wfnk%ndv,k)=wfnkmpi%cg(1:wfnk%ndv,k,ika)
#endif
        enddo ! k=1,sig%nspin
        
      endif ! sig%iwriteint .eq. 0

#ifdef MPI
      call MPI_Bcast(wfnk%nkpt,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
      call MPI_Bcast(wfnk%ndv,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
      call MPI_Bcast(nbandi,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
      call MPI_Bcast(wfnk%isrtk,gvec%ng,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
      call MPI_Bcast(qk,3,MPI_REAL_DP,0,MPI_COMM_WORLD,mpierr)
      call MPI_Bcast(wfnk%ek,nbandi*sig%nspin,MPI_REAL_DP,0,MPI_COMM_WORLD,mpierr)
      call MPI_Bcast(wfnk%elda,nbandi*sig%nspin,MPI_REAL_DP,0,MPI_COMM_WORLD,mpierr)

      if (sig%iwriteint .eq. 0) then
        if (mod(peinf%inode,peinf%npes/peinf%npools).ne.0) then
          SAFE_ALLOCATE(wfnk%zk, (wfnk%ndv,sig%nspin))
        endif
      endif
      
      if (peinf%npools.eq.1) then
        call MPI_Bcast(wfnk%zk,wfnk%ndv*sig%nspin,MPI_SCALAR,0,MPI_COMM_WORLD,mpierr)
      else
        tag=1024
        do j=1,peinf%npools
          source=(j-1)*(peinf%npes/peinf%npools)
          do i=2,peinf%npes/peinf%npools
            dest=(i-1)+source
            if (peinf%inode.eq.source) &
              call MPI_Send(wfnk%zk,wfnk%ndv*sig%nspin,MPI_SCALAR,dest,tag,MPI_COMM_WORLD,mpierr)
            if (peinf%inode.eq.dest) &
              call MPI_Recv(wfnk%zk,wfnk%ndv*sig%nspin,MPI_SCALAR,source,tag,MPI_COMM_WORLD,mpistatus,mpierr)
          enddo
        enddo
      endif
#endif

    endif ! sig%nkn.gt.1

    if (peinf%inode.eq.0) call timacc(19,2,tsec)
    
    if(peinf%inode.eq.0) then
      write(6,35) (kp%rk(j,ikn),j=1,3)
    endif
35  format(/,20x,'K=',3f8.3,/)
    
!----------------------------
! Initialize Matrix Elements

    alda=0.0d0
    ax=0.0d0
    axbis=0.0d0
    achcor(:,:)=(0.0d0,0.0d0)
    asig_imag(:,:)=(0.0d0,0.0d0)
    achcorbis(:,:)=(0.0d0,0.0d0)
    asig_imagbis(:,:)=(0.0d0,0.0d0)
    if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
      asx(:,:,:)=ZERO
      ach(:,:,:)=ZERO
      asig(:,:)=ZERO
      ach_n1q(:,:,:)=ZERO
      ach_n1(:,:,:)=ZERO
      asxbis(:,:,:)=ZERO
      achbis(:,:,:)=ZERO
    endif
    if (sig%freq_dep.eq.2) then
      asxDyn(:,:,:)=(0.0d0,0.0d0)
      achDyn(:,:,:)=(0.0d0,0.0d0)
      ach2Dyn(:,:,:)=(0.0d0,0.0d0)
      asigDyn(:,:)=(0.0d0,0.0d0)
      achD_n1q(:,:,:)=(0.0d0,0.0d0)
      achD_n1(:,:,:)=(0.0d0,0.0d0)
      asxDbis(:,:,:)=(0.0d0,0.0d0)
      achDbis(:,:,:)=(0.0d0,0.0d0)
      ach2Dbis(:,:,:)=(0.0d0,0.0d0)
    endif
    
!----------------------------
! Read matrix elements of Vxc from file vxc.dat
! or compute them on the fly from Vxc potential

    if (peinf%inode.eq.0) call timacc(5,1,tsec)
    if(sig%use_vxcdat) then
      if(peinf%inode == 0) write(6,*) 'Reading vxc.dat'
      call open_file(120,file='vxc.dat',form='formatted',status='old')
      qk(:)=INF
      ierr=0
      do while (ierr.eq.0)
        call read_matrix_elements_type(120, ierr, qk, sig, alda)
        if (all(abs(kp%rk(1:3,ikn)-qk(1:3)) .lt. TOL_Small)) exit
      enddo
      call close_file(120)
          
! Check k-point

      if(any(abs(kp%rk(1:3,ikn)-qk(1:3)) .ge. TOL_Small)) then
        call die('cannot find k-point in vxc.dat', only_root_writes = .true.)
      endif

! Divide by ryd for diag
! this will be undone by shift_energy routines later

      do s2=1,sig%nspin
        do in=1,sig%ndiag
          alda(in,s2) = alda(in,s2)/ryd
        enddo
      enddo

    else ! not using vxc.dat
      
      call logit('Calling mtxel_vxc')
      call mtxel_vxc(gvec,sig,wfnk,wfnkoff,alda)

    endif
    if (peinf%inode.eq.0) call timacc(5,2,tsec)
    
#ifdef CPLX
    if (any(abs(IMAG(alda(1:sig%ndiag,:)))*ryd > 1.0d-4)) imagvxcflag=.true.
#endif

!----------------------------
! Read ax from existing data

    if(sig%use_xdat .and. xflag) then
      if(peinf%inode == 0) write(6,*) 'Reading x.dat'
      call open_file(119,file='x.dat',form='formatted',status='old')
      qk(:)=INF
      ierr=0
      do while (ierr.eq.0)
        call read_matrix_elements_type(119, ierr, qk, sig, ax)
        if (all(abs(kp%rk(1:3,ikn)-qk(1:3)) .lt. TOL_Small)) exit
      enddo
      ax(:,:) = ax(:,:) * sig%xfrac
      call close_file(119)
      
! Check k-point

      if(any(abs(kp%rk(1:3,ikn)-qk(1:3)) .ge. TOL_Small)) then
        call die('cannot find k-point in x.dat', only_root_writes = .true.)
      endif

! Divide by ryd for diag

      do s2=1,sig%nspin
        do in=1,sig%ndiag
          ax(in,s2) = ax(in,s2)/ryd
        enddo
      enddo

#ifdef CPLX
    if (any(abs(IMAG(ax(1:sig%ndiag,:)))*ryd > 1.0d-4)) imagxflag=.true.
#endif

    endif ! using x.dat

!----------------------------
! Find subgroup which leaves kn invariant
! Indices of group operations in subgroup stored in array indsub
! stored in structure syms

    qk(:) = kp%rk(:,ikn)
    if (peinf%inode.eq.0) call timacc(6,1,tsec)
    if (sig%qgridsym) then
      call subgrp(qk,syms)
    else
      syms%ntranq=1
      syms%indsub(1)=1
      syms%kgzero(1:3,1)=0
    endif
    if (peinf%inode.eq.0) call timacc(6,2,tsec)

!----------------------------
! Reduce qpoints with respect to group of kn
! Keep track of number of q-points equivalent
! to give qpoint in irr-bz neq(irq)
!
! Keep track of q-point in the set given in epsmat
! RQ = R(Q) + G0 with transformation R and umklapp G0
! In order to unfold the inverse dielectric matrix from Q to RQ
!
! The q-points are the points for which we have epsilon
! (should be the unshifted irreducible grid)

    SAFE_ALLOCATE(indrq, (gr%nf))
    SAFE_ALLOCATE(neq, (gr%nf))
    SAFE_ALLOCATE(itnrq, (gr%nf))
    SAFE_ALLOCATE(rq, (3,gr%nf))
    SAFE_ALLOCATE(kg0, (3,gr%nf))
    if (peinf%inode.eq.0) call timacc(7,1,tsec)
    call irrbz(syms,gr%nf,gr%f,nrq,neq,indrq,rq,nq,qq,itnrq,kg0)
    if (peinf%inode.eq.0) call timacc(7,2,tsec)
    

!!---------- Loop over k-points in irr bz with respect to kn (rq) --------------

    do irq=1,nrq
      
      if(peinf%inode.eq.0) then
        write(6,60) irq,nrq
        write(6,*) ' '
      endif
60    format(/,3x,'qpoint',i5,' out of ',i5)

      ! Uncomment the line below to kill the job after one k-point.
      ! This is sometimes useful for debugging / performance testing.
      !if (irq .ne. 1) call die('debugging dead',only_root_writes = .true.)

!----------------------------
! Compute energies: |q+g|**2

      do j=1,gvec%ng
        qk(:) = rq(:,irq) + gvec%k(:,j)
        gvec%ekin(j) = DOT_PRODUCT(qk,MATMUL(crys%bdot,qk))
      enddo

!----------------------------
! Sort ekin in ascending order for this q
! The indices are placed in array isrtrq

      call sortrx_D(gvec%ng, gvec%ekin, isrtrq, gvec = gvec%k)
      if ((sig%freq_dep.eq.0.or.sig%exact_ch.eq.1).and.irq.eq.1) then
        isrtrqi=0
        do j=1,gvec%ng
          if (isrtrq(j).ge.1.and.isrtrq(j).le.gvec%ng) &
            isrtrqi(isrtrq(j))=j
        enddo
      endif

!----------------------------
! Compute cutoff in sums over G,G`

      ncoulb = gcutoff(gvec, isrtrq, sig%ecutb)
      ncouls = gcutoff(gvec, isrtrq, sig%ecuts)
      ncoul = max(ncouls,ncoulb)

      if (sig%freq_dep.eq.0.or.sig%exact_ch.eq.1) then
        if(irq.eq.1) ncoulch = ncoul
      else
        ncoulch = 0
      endif

      ! this condition is assumed later in the code (e.g. wpeff array sizes), so we must check it
      if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2) then
        if (ncouls.gt.neps) then
          write(tmpstr,'(a,i6,a,i6)')&
            "screened Coulomb cutoff is bigger than epsilon cutoff"//CHAR(10)&
            //' ncouls = ',ncouls,' neps = ',neps
          call die(tmpstr, only_root_writes = .true.)
        endif
      endif

!----------------------------
! Allocate arrays for q-point rq(:,irq)

      SAFE_ALLOCATE(vcoul, (ncoul))
      SAFE_ALLOCATE(aqs, (peinf%ntband_max,ncoul))
      if (sig%noffdiag.gt.0) then 
        SAFE_ALLOCATE(aqsaug, (peinf%ntband_max,ncoul,sig%ndiag,sig%nspin))
      endif
      if ((sig%freq_dep.eq.0.or.sig%exact_ch.eq.1).and.irq.eq.1) then
        SAFE_ALLOCATE(aqsch, (ncoulch))
        if (nrq.gt.1) then
          SAFE_ALLOCATE(aqsaugchd, (ncoulch,peinf%ndiag_max,sig%nspin))
          if (sig%noffdiag.gt.0) then
            SAFE_ALLOCATE(aqsaugcho, (ncoulch,peinf%noffdiag_max,sig%nspin))
          end if
        endif
      endif
      nm = indrq(irq)


!!!------- Calculate Vcoul -----------------------------------------------------


      if (peinf%inode.eq.0) call timacc(13,1,tsec)
      
      vq=rq(:,irq)
      
      qlen = sqrt(DOT_PRODUCT(vq,MATMUL(crys%bdot,vq)))
      
!      if (peinf%inode .eq. 0) then
!        write(6,*) 'Calculating Vcoul', sig%icutv, ncoul
!      endif
      
      call checkconsistency(sig%icutv,sig%iscreen,sig%q0vec,crys%bdot,sig%freq_dep)
      
      iparallel=1
      
      call vcoul_generator(sig%icutv,sig%truncval,gvec,crys%bdot, &
        gr%nf,ncoul,isrtrq,sig%iscreen,vq,sig%q0vec,vcoul, &
        sig%iwritecoul,iparallel,sig%avgcut,oneoverq,sig%qgrid,epshead, &
        work_scell,sig%averagew,sig%wcoul0)
        
      do ig = 1, ncoul
        vcoul(ig)=fact*vcoul(ig)
      enddo
      if (ika.eq.1.and.irq.eq.1) then
        sig%wcoul0 = sig%wcoul0 * fact
      endif

      if (peinf%inode.eq.0) call timacc(13,2,tsec)
!      if (peinf%inode.eq.0) write(6,*) 'Done Vcoul'
      

!!!------- Finished Calculating Vcoul ------------------------------------------


!!!------- Read inverse dielectric matrix for q-point rq(:,irq) ----------------


      if (peinf%inode.eq.0) call timacc(14,1,tsec)
      
      if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2) then
        
        q0len = sqrt(DOT_PRODUCT(sig%q0vec,MATMUL(crys%bdot,sig%q0vec)))
        
!----------------------------
! Processor 0 read eps^-1 for this q and broadcast to others
!
! Note: the total number of g-vectors used during
! computation of the inverse dielectric matrix
! may be different than in present calculation
! although the sets must coincide for small g
!
! JRD: We must find correct spot in EPS File
! Possible Performance Hazard
! Speed up might be breaking files up
!
! CHP: Now this hazard is gone with comm_mpi
!

        if(sig%iwriteint.eq.0) then
          
          if (peinf%inode.eq.0) then
            call open_file(itpe,file=fne,form='unformatted',status='old')
            read(itpe)
              
! CHP: "if(nm.ne.1) then" is NOT necessary since we have "do ii=1,nm-1"
!                if (nm .ne. 1) then
            do ii=1,nm-1
              read(itpe) ngqt,nmtxt
              if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
                do j=1,nmtxt
                  read(itpe)
                enddo
              else ! sig%freq_dep.eq.2
                do j=1,nmtxt
                  do i=1,nmtxt
                    read(itpe) ! epsR
                  enddo
#ifdef CPLX
                  do i=1,nmtxt
                    read(itpe) ! epsA
                  enddo
#endif
                enddo
              endif
            enddo
!                endif ! if (nm .ne. 1)

            read(itpe) ngq,nmtx,isrtq(1:ngq),isrtqi(1:ngq),(edummy,i=1,ngq),(qk(i),i=1,3)
          endif ! proc 0

#ifdef MPI
          call MPI_Bcast(ngq, 1, MPI_INTEGER, 0, MPI_COMM_WORLD,mpierr)
          call MPI_Bcast(nmtx, 1, MPI_INTEGER, 0, MPI_COMM_WORLD,mpierr)
          call MPI_Bcast(isrtq, ngq, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
          call MPI_Bcast(isrtqi, ngq, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
          call MPI_Bcast(qk, 3, MPI_REAL_DP, 0, MPI_COMM_WORLD, mpierr)
#endif
        else if (sig%iwriteint.eq.1) then
          ngq = gvec%ng
          nmtx = epsmpi%nmtx(nm)

          if (peinf%inode .eq. 0) then
            isrtq(:) = epsmpi%isrtq(:,nm)
            isrtqi(:) = epsmpi%isrtqi(:,nm)
          endif
#ifdef MPI
          call MPI_Bcast(isrtq, ngq, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
          call MPI_Bcast(isrtqi, ngq, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
#endif
          qk(:) = epsmpi%qk(:,nm)
        endif

! Find g=0 in main gvec list and eps gvector list

        call findvector(iout,0,0,0,gvec)
        iout = isrtqi(iout)
          
        if (peinf%inode.eq.0) write(6,*) 'Reading Eps Back'
        
        if (sig%iwriteint.eq.0) then
          
          do ijk = 1, nmtx

            if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
              epstemp=0D0
            else ! sig%freq_dep.eq.2
              epsRtemp=0D0
#ifdef CPLX
              epsAtemp=0D0
#endif
            endif
            
            if (peinf%inode .eq. 0) then
              
              if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
                read(itpe) (epstemp(i),i=1,nmtx)
              else ! sig%freq_dep.eq.2
                do i=1,nmtx
                  read(itpe) (epsRtemp(jj,i),jj=1,sig%nFreq)
                enddo
#ifdef CPLX
                do i=1,nmtx
                  read(itpe) (epsAtemp(jj,i),jj=1,sig%nFreq)
                enddo
#endif
              endif

!-------------------------------------------------------------------------------
! Now distribute it

              if(igown(ijk) .ne. 0) then
                tag = ijk + 1000
                
! JRD: Possible time Hazard.  Can we do this all in epscopy once and for all?
! CHP: Yes, use comm_mpi!

#ifdef MPI
                dest=igown(ijk)
                if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
                  call MPI_SEND(epstemp,neps,MPI_SCALAR,dest,tag,MPI_COMM_WORLD,mpierr)
                else ! sig%freq_dep.eq.2
                  call MPI_SEND(epsRtemp,sig%nFreq*neps,MPI_COMPLEX_DPC,dest,tag,MPI_COMM_WORLD,mpierr)
#ifdef CPLX
                  call MPI_SEND(epsAtemp,sig%nFreq*neps,MPI_COMPLEX_DPC,dest,tag,MPI_COMM_WORLD,mpierr)
#endif
                endif
#endif
              else if(igown(ijk) .eq. 0) then
                if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
                  eps(:,iggown(ijk))=epstemp(:)
                else ! sig%freq_dep.eq.2
                  epsR(:,:,iggown(ijk))=epsRtemp(:,:)
#ifdef CPLX
                  epsA(:,:,iggown(ijk))=epsAtemp(:,:)
#endif
                endif
#ifdef VERBOSE
                if (iggown(ijk) .eq. 0) then
                  write(0,699) 0, ijk, iggown(ijk)
                endif
#endif
              endif ! if(igown(ijk) .eq. 0)
              
            endif ! proc 0
              
            if (peinf%inode .eq. igown(ijk) .and. peinf%inode .ne. 0) then
              tag = ijk + 1000
#ifdef MPI
              if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
                call MPI_RECV(epstemp,neps,MPI_SCALAR,0,tag,MPI_COMM_WORLD,mpistatus,mpierr)
              else ! sig%freq_dep.eq.2
                call MPI_RECV(epsRtemp,sig%nFreq*neps,MPI_COMPLEX_DPC,0,tag,MPI_COMM_WORLD,mpistatus,mpierr)
#ifdef CPLX
                call MPI_RECV(epsAtemp,sig%nFreq*neps,MPI_COMPLEX_DPC,0,tag,MPI_COMM_WORLD,mpistatus,mpierr)
#endif
              endif
#endif
              if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
                eps(:,iggown(ijk))=epstemp(:)
              else !  sig%freq_dep.eq.2
                epsR(:,:,iggown(ijk))=epsRtemp(:,:)
#ifdef CPLX
                epsA(:,:,iggown(ijk))=epsAtemp(:,:)
#endif
              endif
#ifdef VERBOSE
              if (iggown(ijk) .eq. 0) then
                write(0,699) peinf%inode, ijk, iggown(ijk)
              endif
#endif
            endif ! proc non-0
            
699         format(1x,"WARNING: distribution of inverse epsilon matrix", &
              /,3x,"inode =",i5,1x,"ijk =",i6,1x,"iggown(ijk) =",i2,/)
            
          enddo  ! ijk
        endif  ! if (sig%iwriteint.eq.0)
        
        if (sig%iwriteint.eq.1) then
          if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
            eps(:,:)=epsmpi%eps(:,:,nm)
          else ! sig%freq_dep.eq.2
            epsR(:,:,:)=epsmpi%epsR(:,:,:,nm)
#ifdef CPLX
            epsA(:,:,:)=epsmpi%epsA(:,:,:,nm)
#endif
          endif
        endif ! if (sig%iwriteint.eq.1)
        
          
! CHP: By first distributing and then doing the wing fix, we can let
!      all the processors work together, thus, saving some time.
!      This is the case for both comm_mpi and comm_disk.

!-------------------------------------------------------------------------------
! Fix wing divergence for semiconductors and graphene

! This should really be done for all "|q+G| < avgcut" - but for now,
! it is done if "|q| < avgcut and G=0"

        do ijk = 1, nmtx
          
          if (peinf%inode.ne.igown(ijk)) cycle
          
          if (iggown(ijk).eq.0) write(6,*) 'ijk!!',peinf%inode,ijk
          
          if (nm .eq. 1) then
            q0flag=.true.
            if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
              call fixwings(vcoul(1),sig%wcoul0,eps(:,iggown(ijk)),sig%icutv, &
                sig%iscreen,ijk,nmtx,iout,q0len,oneoverq,fact,q0flag,sig%averagew,crys%bdot)
            else ! sig%freq_dep.eq.2
              call fixwings_dyn(vcoul(1),epsR(:,:,iggown(ijk)), &
                sig%icutv,sig%iscreen,ijk,sig%nFreq,nmtx,iout,q0len,oneoverq,fact,q0flag,crys%bdot)
#ifdef CPLX
              call fixwings_dyn(vcoul(1),epsA(:,:,iggown(ijk)), &
                sig%icutv,sig%iscreen,ijk,sig%nFreq,nmtx,iout,q0len,oneoverq,fact,q0flag,crys%bdot)
#endif
            endif
          else if (qlen**2 .lt. sig%avgcut) then
            q0flag=.false.
            if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
              call fixwings(vcoul(1),sig%wcoul0,eps(:,iggown(ijk)),sig%icutv, &
                sig%iscreen,ijk,nmtx,iout,qlen,oneoverq,fact,q0flag,sig%averagew,crys%bdot)
            else ! sig%freq_dep.eq.2
              call fixwings_dyn(vcoul(1),epsR(:,:,iggown(ijk)), &
                sig%icutv,sig%iscreen,ijk,sig%nFreq,nmtx,iout,qlen,oneoverq,fact,q0flag,crys%bdot)
#ifdef CPLX
              call fixwings_dyn(vcoul(1),epsA(:,:,iggown(ijk)), &
                sig%icutv,sig%iscreen,ijk,sig%nFreq,nmtx,iout,qlen,oneoverq,fact,q0flag,crys%bdot)
#endif
            endif
          endif ! q=0
          
        enddo ! ijk
        
        call logit('Read data from unit itpe or memory')
        if (peinf%inode.eq.0 .and. sig%iwriteint == 0) then
          write(6,*) 'Done Read Eps Back'
! use of below causes NaNs from bad optimization, blocked by statement containing NaN below
! bad at -O2, -O3, ok at -O1 and -O0 at least for gfortran. --DAS
          call close_file(itpe)
!          close(itpe)  ! using this line instead has no optimization issue
        endif
        
      endif  ! sig%freq_dep
      
      if (sig%freq_dep.eq.-1) then
        ngq=0
        nmtx=0
        qk(:)=qq(:,nm)
      endif
      
      if (peinf%inode.eq.0) call timacc(14,2,tsec)
      
#ifdef VERBOSE
      if (peinf%inode.eq.0) then
        write(6,'(3(a,i10))') ' *** VERBOSE: nmtx =',nmtx,' ncouls =',ncouls
        write(6,*)
      endif
#endif
      if(nmtx.gt.neps) then
        call die('nmtx.gt.neps')
      endif
        
! Check q-vector

      if(any(abs(qq(1:3, nm) - qk(1:3)) .gt. TOL_SMALL)) then
        write(0,*) peinf%inode,qq(:,nm),qk(:)
        call die('q-vector check wrong')
      endif
        
      if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2) then
        if(ncouls .gt. nmtx) ncouls = nmtx
      endif
      
      if(peinf%inode.eq.0) then
        if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
          write(6,100) (rq(i,irq),i=1,3),ncouls,dble(eps(1,1))
        endif
        if (sig%freq_dep.eq.2) then
          write(6,100) (rq(i,irq),i=1,3),ncouls,dble(epsR(1,1,1))
        endif
      endif
100   format(3x,'q=',3f6.3,2x,'n=',i6,2x,'eps(1,1)=',f6.2,2x,/)
      
! Map g-vectors required for eps**(-1)(r(q)) to those
! for the known matrix eps**(-1)(q) and calculate phases

      itran = itnrq(irq)
      kg(:) = kg0(:,irq)
      if (peinf%inode.eq.0) call timacc(8,1,tsec)
      call logit('Calling gmap')
      if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2) then
        ind(:)=0
        ph(:)=ZERO
        call gmap(gvec,syms,ncouls,itran,kg,isrtrq,isrtqi,ind,ph, &
         sig%die_outside_sphere) ! TROUBLE
      endif
      if (peinf%inode.eq.0) call timacc(8,2,tsec)
      

!!!------- Done reading inverse dielectric matrix for q-point rq(:,irq) --------


!--------------------
! Generate needed wavefunctions for rkq = rkn - rq
! stored in derived type wfnkq

      rkq(1:3) = kp%rk(1:3, ikn) - rq(1:3, irq)
      if (peinf%inode.eq.0) call timacc(9,1,tsec)
      call logit('Calling genwf')
      if (sig%iwriteint .eq. 0) then
        call genwf_disk(rkq,syms,gvec,crys,kp,sig,wfnkq,itpc,fnc)
      else
        call genwf_mpi(rkq,syms,gvec,crys,kp,sig,wfnkq,wfnkqmpi)
      endif
      if (peinf%inode.eq.0) call timacc(9,2,tsec)
      
      
      if (peinf%inode.eq.0) write(6,*)
      
!!-------- Loop over spins for which Sigma is computed -------------------------

      do ispin=1,sig%nspin


!!-------- Loop over bands for which diag Sigma is computed --------------------

! Bands are relabelled according to sig%diag(1:sig%ndiag)
! Loop is parallelized according to peinf%index_diag(1:peinf%ndiag_max)

        do in=1,peinf%ndiag_max
          
          if (peinf%inode.eq.0) then
            if (peinf%npools.eq.1) then
              write(6,999) peinf%index_diag(in)+(ispin-1)*sig%ndiag, &
                peinf%ndiag_max*peinf%npools*sig%nspin
            else
              write(6,997) peinf%index_diag(in)+(ispin-1)*sig%ndiag, &
                peinf%index_diag(in)+(ispin-1)*sig%ndiag+peinf%npools-1, &
                peinf%ndiag_max*peinf%npools*sig%nspin
            endif
          endif
999       format(1x,"Computing Sigma diag",i4,1x,"of",i4)
997       format(1x,"Computing Sigma diag",i4,1x,"to",i4,1x,"of",i4)
          
          write(tmpstr,*) 'Working on band ', sig%diag(peinf%index_diag(in)), ' 1st pool'
          call logit(tmpstr)

!---------------------
! Compute planewave matrix elements of g <n,k|exp{i(q+g).r}|n1,k-q>
! Note: wfnk%zk array keeps only the bands specified in sig%diag(:)
! Must keep track of the right band label

          call logit('Calling mtxel')
          if (peinf%inode.eq.0) call timacc(10,1,tsec)
          call mtxel(in,gvec,wfnkq,wfnk,ncoul,isrtrq,aqs,ispin)
          if (peinf%inode.eq.0) call timacc(10,2,tsec)
          if (sig%noffdiag.gt.0.and.peinf%flag_diag(in)) then
            do i=1,ncoul
              do j=1,peinf%ntband_node
                aqsaug(j,i,peinf%index_diag(in),ispin)=aqs(j,i)
              enddo
            enddo
          endif
          if (sig%freq_dep.eq.0.or.sig%exact_ch.eq.1) then
            if (irq.eq.1) then
              call logit('Calling mtxel_ch')
              if (peinf%inode.eq.0) call timacc(16,1,tsec)
              if (mod(peinf%inode,peinf%npes/peinf%npools).eq.0) then
                call mtxel_ch(in,in,gvec,wfnk,ncoulch,isrtrq,aqsch,ispin)
              else
                aqsch(:)=0.0d0
              endif
              if (peinf%inode.eq.0) call timacc(16,2,tsec)
              if (nrq.gt.1) aqsaugchd(:,in,ispin)=aqsch(:)
            else
              aqsch(:)=aqsaugchd(:,in,ispin)
            endif
          endif

!---------------------
! Compute diag SX and CH

          if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2) then
            call logit('Calling mtxel_sxch for diagonal matrix elements')
            if (peinf%inode.eq.0) call timacc(11,1,tsec)
          
            call mtxel_sxch(peinf%index_diag(in), &
              peinf%index_diag(in),ispin,ncouls,neps,gvec,eps,ph, &
              ind,isrtrqi,isrtrq,vcoul,crys,sig,wpg,wfnk,wfnkq,ncoulch, &
              aqs,aqs,aqsch,asigt_imag,acht_n1,asxt,acht,achtcor, &
              kp%nspin,rq(:,irq),coulfact,igown,iggown, &
              epsR,epsA,achtD_n1,asxtDyn,achtDyn,ach2tDyn,1)

            if (peinf%inode.eq.0) call timacc(11,2,tsec)
          else
            achtcor = ZERO
            asigt_imag = ZERO
          endif
          
          if (peinf%flag_diag(in)) then
            achcor(peinf%index_diag(in),ispin)= &
              achcor(peinf%index_diag(in),ispin)+neq(irq)*achtcor
            asig_imag(peinf%index_diag(in),ispin)= &
              asig_imag(peinf%index_diag(in),ispin)+neq(irq)*asigt_imag
            if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
              do iw=1,3
                asx(iw,peinf%index_diag(in),ispin)= &
                  asx(iw,peinf%index_diag(in),ispin)+neq(irq)*asxt(iw)
                ach(iw,peinf%index_diag(in),ispin)= &
                  ach(iw,peinf%index_diag(in),ispin)+neq(irq)*acht(iw)
              enddo
              do ib=1,sig%ntband
                ach_n1q(ib,peinf%index_diag(in),ispin)= &
                  ach_n1q(ib,peinf%index_diag(in),ispin)+neq(irq)*acht_n1(ib)
              enddo
            endif
            if (sig%freq_dep.eq.2) then
              do iw=1,sig%nfreqeval
                asxDyn(iw,peinf%index_diag(in),ispin)= &
                  asxDyn(iw,peinf%index_diag(in),ispin)+neq(irq)*asxtDyn(iw)
                achDyn(iw,peinf%index_diag(in),ispin)= &
                  achDyn(iw,peinf%index_diag(in),ispin)+neq(irq)*achtDyn(iw)
                ach2Dyn(iw,peinf%index_diag(in),ispin)= &
                  ach2Dyn(iw,peinf%index_diag(in),ispin)+neq(irq)*ach2tDyn(iw)
              enddo
              do ib=1,sig%ntband
                achD_n1q(ib,peinf%index_diag(in),ispin)= &
                  achD_n1q(ib,peinf%index_diag(in),ispin)+neq(irq)*achtD_n1(ib)
              enddo
            endif
          endif
          
!---------------------
! Compute diag bare exchange (SKIP this computation if you already know it)

          if (peinf%inode.eq.0) call timacc(18,1,tsec)
          if(.not. sig%use_xdat .and. xflag) then
            call logit('Computing bare X')
            axt=0.0d0
            do ig=1,ncoulb
              igad=isrtrq(ig)
              ax_ig=0.0d0
                
              do n1=1,peinf%nvband_node
                tempval = wfnkq%ekq(peinf%indext(n1),ispin) - sig%efermi
                if (tempval < TOL_Degeneracy) then
                  if(abs(tempval) < TOL_Degeneracy) then
                    occ=0.5  ! Fermi-Dirac distribution = 1/2 at Fermi level
                  else
                    occ=1D0
                  endif
                  ax_ig = ax_ig + abs(aqs(n1,ig))**2 * occ
                endif
                ! sig%ncrit = 0 and tempval > TOL_Degeneracy should never happen!
              enddo
              axt=axt+ax_ig*vcoul(ig)
            enddo
            if (peinf%flag_diag(in)) then
              ax(peinf%index_diag(in),ispin)=ax(peinf%index_diag(in),ispin)-neq(irq)*axt*sig%xfrac
            endif
          endif ! not use x.dat
          if (peinf%inode.eq.0) call timacc(18,2,tsec)
          
        enddo ! in (loop over bands for which we need diag Sigma)
        
        if (ispin.eq.sig%nspin) then
          SAFE_DEALLOCATE_P(wfnkq%zkq)
          SAFE_DEALLOCATE(aqs)
          if ((sig%freq_dep.eq.0.or.sig%exact_ch.eq.1).and.irq.eq.nrq) then
            if (nrq.gt.1) then
              SAFE_DEALLOCATE(aqsaugchd)
            end if
          endif
        endif
        
        if (peinf%inode.eq.0) write(6,*)

!!-------- End diag band loop --------------------------------------------------

! (gsm) begin distributing aqsaug matrix elements for offdiag calculation

! $$$ inefficient communication, this should be rewritten $$$

        if (peinf%inode.eq.0) call timacc(17,1,tsec)
#ifdef MPI
        if (sig%noffdiag.gt.0.and.peinf%npools.gt.1) then
          tag=1024
          do in=1,sig%ndiag
            jj=mod(in-1,peinf%npools)+1
            do i=1,peinf%npes/peinf%npools
              source=(i-1)+(jj-1)*(peinf%npes/peinf%npools)
              do j=1,peinf%npools
                dest=(i-1)+(j-1)*(peinf%npes/peinf%npools)
                if (j.ne.jj) then
                  if (peinf%inode.eq.source) &
                    call MPI_Send(aqsaug(:,:,in,ispin),peinf%ntband_max*ncoul,MPI_SCALAR,dest,tag,MPI_COMM_WORLD,mpierr)
                  if (peinf%inode.eq.dest) &
                    call MPI_Recv(aqsaug(:,:,in,ispin),peinf%ntband_max*ncoul,MPI_SCALAR, &
                      source,tag,MPI_COMM_WORLD,mpistatus,mpierr)
                endif
              enddo
            enddo
          enddo
        endif ! sig%noffdiag.gt.0.and.peinf%npools.gt.1
#endif
        if (peinf%inode.eq.0) call timacc(17,2,tsec)
        
! (gsm) end distributing aqsaug matrix elements for offdiag calculation

!!-------- Loop over bands for which offdiag Sigma is computed -----------------

! Bands are relabelled according to sig%off*(1:sig%noffdiag)
! Loop is parallelized according to peinf%index_offdiag(1:peinf%noffdiag_max)

        do ioff=1,peinf%noffdiag_max
          
          if (peinf%inode.eq.0) then
            if (peinf%npools.eq.1) then
              write(6,998) peinf%index_offdiag(ioff)+(ispin-1)*sig%noffdiag, &
                peinf%noffdiag_max*peinf%npools*sig%nspin
            else
              write(6,996) peinf%index_offdiag(ioff)+(ispin-1)*sig%noffdiag, &
                peinf%index_offdiag(ioff)+(ispin-1)*sig%noffdiag+peinf%npools-1, &
                peinf%noffdiag_max*peinf%npools*sig%nspin
            endif
          endif
998       format(1x,"Computing Sigma offdiag",i4,1x,"of",i4)
996       format(1x,"Computing Sigma offdiag",i4,1x,"to",i4,1x,"of",i4)
          
          write(tmpstr,'(a,2i6,a)') 'Working on bands ', sig%off1(peinf%index_offdiag(ioff)), &
            sig%off2(peinf%index_offdiag(ioff)), ' 1st pool'
          call logit(tmpstr)

! <n|Sigma|m> = 0 if n and m belong to different irreducible representations
! Even without assigning representations, we can tell they are different if
! the size of the degenerate subspace is different for n and m.
! This saves time and helps enforce symmetry. -- DAS
          bExactlyZero = .false.
          if(.not. sig%wfn_outer_present .and. sig%offdiagsym .and. &
             kp%degeneracy(sig%off1(peinf%index_offdiag(ioff)), ikn, ispin) /= &
             kp%degeneracy(sig%off2(peinf%index_offdiag(ioff)), ikn, ispin)) then
            if (peinf%inode.eq.0) write(6,'(a)') 'Zero by symmetry -- not computing.'
            ! the matrix elements are zeroed at the beginning, and at the end of the loop
            ! so we can just leave them as they are
            ! JRD - We cannot cycle here because other pools may have legitimate work to do
            ! and we need to be around for the communication.  Particularly inside the subroutine
            ! mtxel_sxch. Thus we can`t really save time here. We could still zero out elements 
            ! at bottom of loop if want. I leave it up to DAS.
            !cycle
            bExactlyZero=.true.
          endif
            
!---------------------
! Compute planewave matrix elements for exact static CH

          if (sig%freq_dep.eq.0.or.sig%exact_ch.eq.1) then
            if (irq.eq.1) then
              if (peinf%inode.eq.0) call timacc(20,1,tsec)
              wfnkoff%nkpt=wfnk%nkpt
              if (mod(peinf%inode,peinf%npes/peinf%npools).eq.0) then
                SAFE_ALLOCATE(wfnkoff%isrtk, (gvec%ng))
                wfnkoff%isrtk=wfnk%isrtk
                SAFE_ALLOCATE(wfnkoff%zk, (2*wfnk%nkpt,1))
              endif
                
! (gsm) begin gathering wavefunctions over pools

! $$$ inefficient communication, this should be rewritten $$$

              do jj=1,peinf%npools
                dest=(jj-1)*(peinf%npes/peinf%npools)
                do ii=1,2
                  i=sig%offmap(peinf%index_offdiag(ioff),ii)
#ifdef MPI
                  call MPI_Bcast(i,1,MPI_INTEGER,dest,MPI_COMM_WORLD,mpierr)
#endif
                  j=(i-1)/peinf%npools+1
                  source=mod(i-1,peinf%npools)*(peinf%npes/peinf%npools)
                  if (peinf%inode.eq.source.and.peinf%inode.eq.dest) then
                    do k=1,wfnk%nkpt
                      wfnkoff%zk((ii-1)*wfnk%nkpt+k,1) = wfnk%zk((j-1)*wfnk%nkpt+k,ispin)
                    enddo
                  else
#ifdef MPI
                    tag=1024
                    if (peinf%inode.eq.source) &
                      call MPI_Send(wfnk%zk((j-1)*wfnk%nkpt+1,ispin),wfnk%nkpt, &
                        MPI_SCALAR,dest,tag,MPI_COMM_WORLD,mpierr)
                    if (peinf%inode.eq.dest) &
                      call MPI_Recv(wfnkoff%zk((ii-1)*wfnk%nkpt+1,1),wfnk%nkpt, &
                        MPI_SCALAR,source,tag,MPI_COMM_WORLD,mpistatus,mpierr)
#else
                    do k=1,wfnk%nkpt
                      wfnkoff%zk((ii-1)*wfnk%nkpt+k,1) = wfnk%zk((j-1)*wfnk%nkpt+k,ispin)
                    enddo
#endif
                  endif
                enddo
              enddo
              
! (gsm) end gathering wavefunctions over pools

              if (peinf%inode.eq.0) call timacc(20,2,tsec)
              call logit('Calling mtxel_ch')
              if (peinf%inode.eq.0) call timacc(16,1,tsec)
              if (mod(peinf%inode,peinf%npes/peinf%npools).eq.0) then
                call mtxel_ch(1,2,gvec,wfnkoff,ncoulch,isrtrq,aqsch,1)
              else
                aqsch(:)=0.0d0
              endif
              if (peinf%inode.eq.0) call timacc(16,2,tsec)
              if (mod(peinf%inode,peinf%npes/peinf%npools).eq.0) then
                SAFE_DEALLOCATE_P(wfnkoff%isrtk)
                SAFE_DEALLOCATE_P(wfnkoff%zk)
              endif
              if (nrq.gt.1) aqsaugcho(:,ioff,ispin)=aqsch(:)
            else
              aqsch(:)=aqsaugcho(:,ioff,ispin)
            endif
          endif ! sig%freq_dep.eq.0.or.sig%exact_ch.eq.1

!---------------------
! Compute offdiag SX and CH

          if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2) then
            call logit('Calling mtxel_sxch for offdiagonal matrix elements')
            if (peinf%inode.eq.0) call timacc(11,1,tsec)
          
            call mtxel_sxch(sig%offmap(peinf%index_offdiag(ioff),1), &
              sig%offmap(peinf%index_offdiag(ioff),3), &
              ispin,ncouls,neps,gvec,eps,ph,ind,isrtrqi, &
              isrtrq,vcoul,crys,sig,wpg,wfnk,wfnkq,ncoulch, &
              aqsaug(:,:,sig%offmap(peinf%index_offdiag(ioff),1),ispin), &
              aqsaug(:,:,sig%offmap(peinf%index_offdiag(ioff),2),ispin), &
              aqsch,asigt_imag,acht_n1,asxt,acht,achtcor,kp%nspin, &
              rq(:,irq),coulfact,igown,iggown, &
              epsR,epsA,achtD_n1,asxtDyn,achtDyn,ach2tDyn,2)

            if (peinf%inode.eq.0) call timacc(11,2,tsec)
          endif
          
          if (bExactlyZero) cycle
         
          if (peinf%flag_offdiag(ioff)) then
            achcor(peinf%index_offdiag(ioff)+sig%ndiag,ispin)= &
              achcor(peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*achtcor
            asig_imag(peinf%index_offdiag(ioff)+sig%ndiag,ispin)= &
              asig_imag(peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*asigt_imag
            if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
              do iw=1,3
                asx(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)= &
                  asx(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*asxt(iw)
                ach(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)= &
                  ach(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*acht(iw)
              enddo
              do ib=1,sig%ntband
                ach_n1q(ib,peinf%index_offdiag(ioff)+sig%ndiag,ispin) = &
                  ach_n1q(ib,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+neq(irq)*acht_n1(ib)
              enddo
            endif
            if (sig%freq_dep.eq.2) then
              do iw=1,sig%nfreqeval
                asxDyn(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin) = &
                  asxDyn(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*asxtDyn(iw)
                achDyn(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin) = &
                  achDyn(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*achtDyn(iw)
                ach2Dyn(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin) = &
                  ach2Dyn(iw,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+ryd*neq(irq)*ach2tDyn(iw)
              enddo
              do ib=1,sig%ntband
                achD_n1q(ib,peinf%index_offdiag(ioff)+sig%ndiag,ispin) = &
                  achD_n1q(ib,peinf%index_offdiag(ioff)+sig%ndiag,ispin)+neq(irq)*achtD_n1(ib)
              enddo
            endif
          endif
          
!---------------------
! Compute offdiag bare exchange (SKIP this computation if you already know it)

          if (peinf%inode.eq.0) call timacc(18,1,tsec)
          if(.not. sig%use_xdat .and. xflag) then
            axt=0.0d0
            do ig=1,ncoulb
              igad=isrtrq(ig)
              ax_ig=0.0d0
              do n1=1,peinf%nvband_node
                tempval = wfnkq%ekq(peinf%indext(n1),ispin) - sig%efermi
                if (tempval < TOL_Degeneracy) then
                  if(abs(tempval) < TOL_Degeneracy) then
                    occ=0.5  ! Fermi-Dirac distribution = 1/2 at Fermi level
                  else
                    occ=1D0
                  endif
                  ax_ig=ax_ig+aqsaug(n1,ig,sig%offmap(peinf%index_offdiag(ioff),1),ispin) &
                    *MYCONJG(aqsaug(n1,ig,sig%offmap(peinf%index_offdiag(ioff),2),ispin))*occ
                endif
              enddo
              axt=axt+ax_ig*vcoul(ig)
            enddo
            if (peinf%flag_offdiag(ioff)) then
              ax(peinf%index_offdiag(ioff)+sig%ndiag,ispin)= &
                ax(peinf%index_offdiag(ioff)+sig%ndiag,ispin) - neq(irq)*axt*ryd*sig%xfrac
            endif
          endif ! not using x.dat
          if (peinf%inode.eq.0) call timacc(18,2,tsec)
          
        enddo ! ioff (loop over bands for which we need offdiag Sigma)
        
        if (ispin.eq.sig%nspin) then
          SAFE_DEALLOCATE(vcoul)
          if (sig%noffdiag.gt.0) then
            SAFE_DEALLOCATE(aqsaug)
          end if
          if ((sig%freq_dep.eq.0.or.sig%exact_ch.eq.1).and.irq.eq.nrq) then
            SAFE_DEALLOCATE(aqsch)
            if (nrq.gt.1.and.sig%noffdiag.gt.0) then
              SAFE_DEALLOCATE(aqsaugcho)
            end if
          endif
        endif
          
!!-------- End offdiag band loop -----------------------------------------------


      enddo ! ispin

!!-------- End spin loop -------------------------------------------------------

      if (peinf%inode.eq.0) write(6,*)
      
!---------------------
! Add up contributions from all PEs

      if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
        SAFE_ALLOCATE(xdum1, (9*(sig%ndiag+sig%noffdiag),sig%nspin))
        do ispin=1,sig%nspin
          do in=1,sig%ndiag+sig%noffdiag
            idum = (in - 1) * 9
            xdum1(idum+1,ispin) = ax(in,ispin)
            xdum1(idum+2,ispin) = asx(1,in,ispin)
            xdum1(idum+3,ispin) = asx(2,in,ispin)
            xdum1(idum+4,ispin) = asx(3,in,ispin)
            xdum1(idum+5,ispin) = ach(1,in,ispin)
            xdum1(idum+6,ispin) = ach(2,in,ispin)
            xdum1(idum+7,ispin) = ach(3,in,ispin)
            xdum1(idum+8,ispin) = SCALARIFY(achcor(in,ispin))
            xdum1(idum+9,ispin) = SCALARIFY(asig_imag(in,ispin))
          enddo
        enddo
      endif
      if (sig%freq_dep.eq.2) then
        nfold = (3*sig%nfreqeval+2)            
        SAFE_ALLOCATE(xdum1Dyn, (nfold*(sig%ndiag+sig%noffdiag),sig%nspin))
        do ispin=1,sig%nspin
          do in=1,sig%ndiag+sig%noffdiag
            idum = (in - 1) * nfold
            xdum1Dyn(idum+1,ispin) = ax(in,ispin)
            do ifold = 1, sig%nfreqeval
              xdum1Dyn(idum+1+ifold,ispin) = asxDyn(ifold,in,ispin)
            enddo
            do ifold = 1, sig%nfreqeval
              xdum1Dyn(idum+1+sig%nfreqeval+ifold,ispin) = achDyn(ifold,in,ispin)
            enddo
            do ifold = 1, sig%nfreqeval
              xdum1Dyn(idum+1+2*sig%nfreqeval+ifold,ispin) = ach2Dyn(ifold,in,ispin)
            enddo
            xdum1Dyn(idum+3*sig%nfreqeval+2,ispin) = achcor(in,ispin)
          enddo
        enddo
      endif
      
#ifdef MPI
      call logit('Calling MPI_Allreduce xdum1 to xdum2')
#endif
      if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
        SAFE_ALLOCATE(xdum2, (9*(sig%ndiag+sig%noffdiag),sig%nspin))
        ndum=(sig%ndiag+sig%noffdiag)*9*sig%nspin
#ifdef MPI
        call MPI_Allreduce(xdum1(1,1),xdum2(1,1),ndum,MPI_SCALAR,MPI_SUM,MPI_COMM_WORLD,mpierr)
#else
        xdum2=xdum1
#endif
        SAFE_DEALLOCATE(xdum1)
      endif
      if (sig%freq_dep.eq.2) then
        ndum=(sig%ndiag+sig%noffdiag)*nfold*sig%nspin
        SAFE_ALLOCATE(xdum2Dyn, (nfold*(sig%ndiag+sig%noffdiag),sig%nspin))
#ifdef MPI
        call MPI_Allreduce(xdum1Dyn(1,1),xdum2Dyn(1,1),ndum,MPI_COMPLEX_DPC,MPI_SUM,MPI_COMM_WORLD,mpierr)
#else
        xdum2Dyn=xdum1Dyn
#endif
        SAFE_DEALLOCATE(xdum1Dyn)
      endif
      call logit('Done reduction of xdum1 to xdum2')
      
      if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
        do ispin=1,sig%nspin
          do in=1,sig%ndiag+sig%noffdiag
            idum = (in - 1) * 9
            axbis(in,ispin) = axbis(in,ispin) + xdum2(idum+1,ispin)
            asxbis(1,in,ispin) = asxbis(1,in,ispin) + xdum2(idum+2,ispin)
            asxbis(2,in,ispin) = asxbis(2,in,ispin) + xdum2(idum+3,ispin)
            asxbis(3,in,ispin) = asxbis(3,in,ispin) + xdum2(idum+4,ispin)
            achbis(1,in,ispin) = achbis(1,in,ispin) + xdum2(idum+5,ispin)
            achbis(2,in,ispin) = achbis(2,in,ispin) + xdum2(idum+6,ispin)
            achbis(3,in,ispin) = achbis(3,in,ispin) + xdum2(idum+7,ispin)
            achcorbis(in,ispin) = achcorbis(in,ispin) + xdum2(idum+8,ispin)
            asig_imagbis(in,ispin) = asig_imagbis(in,ispin) + xdum2(idum+9,ispin)
            if(.not. sig%use_xdat) ax(in,ispin) = ZERO
            asx(1:3,in,ispin) = ZERO
            ach(1:3,in,ispin) = ZERO
            achcor(in,ispin) = (0.0d0,0.0d0)
            asig_imag(in,ispin) = (0.0d0,0.0d0)
          enddo
        enddo
        SAFE_DEALLOCATE(xdum2)
      endif
      if (sig%freq_dep.eq.2) then
        do ispin=1,sig%nspin
          do in=1,sig%ndiag+sig%noffdiag
            idum = (in - 1) * nfold
            axbis(in,ispin) = axbis(in,ispin) + xdum2Dyn(idum+1,ispin)
            do ifold = 1, sig%nfreqeval
              asxDbis(ifold,in,ispin) = asxDbis(ifold,in,ispin) + xdum2Dyn(idum+1+ifold,ispin)
            enddo
            do ifold = 1, sig%nfreqeval
              achDbis(ifold,in,ispin) = achDbis(ifold,in,ispin) + xdum2Dyn(idum+1+sig%nfreqeval+ifold,ispin)
            enddo
            do ifold = 1, sig%nfreqeval
              ach2Dbis(ifold,in,ispin) = ach2Dbis(ifold,in,ispin) + &
                xdum2Dyn(idum+1+2*sig%nfreqeval+ifold,ispin)
            enddo
            achcorbis(in,ispin) = achcorbis(in,ispin) + xdum2Dyn(idum+3*sig%nfreqeval+2,ispin)
            if(.not. sig%use_xdat) ax(in,ispin) = ZERO
            asxDyn(:,in,ispin) = (0.0d0,0.0d0)
            achDyn(:,in,ispin) = (0.0d0,0.0d0)
            ach2Dyn(:,in,ispin) = (0.0d0,0.0d0)
            achcor(in,ispin) = (0.0d0, 0.0d0)
          enddo
        enddo
        SAFE_DEALLOCATE(xdum2Dyn)
      endif
      
      in = sig%ntband * (sig%ndiag + sig%noffdiag) * sig%nspin
      if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
        SAFE_ALLOCATE(xdum3, (sig%ntband,sig%ndiag+sig%noffdiag,sig%nspin))
        xdum3 = ach_n1q
#ifdef MPI
        call MPI_Allreduce(xdum3(1,1,1),ach_n1q(1,1,1),in,MPI_SCALAR,MPI_SUM,MPI_COMM_WORLD,mpierr)
#endif
        SAFE_DEALLOCATE(xdum3)
        do ispin = 1, sig%nspin
          do in = 1, sig%ndiag + sig%noffdiag
            do ib = 1, sig%ntband
              ach_n1(ib,in,ispin) = ach_n1(ib,in,ispin) + ach_n1q(ib,in,ispin)
              ach_n1q(ib,in,ispin) = 0.0d0
            enddo
          enddo
        enddo
      endif
      if (sig%freq_dep.eq.2) then
        SAFE_ALLOCATE(xdum3Dyn, (sig%ntband,sig%ndiag+sig%noffdiag,sig%nspin))
        xdum3Dyn = achD_n1q
#ifdef MPI
        call MPI_Allreduce(xdum3Dyn(1,1,1),achD_n1q(1,1,1),in,MPI_COMPLEX_DPC,MPI_SUM,MPI_COMM_WORLD,mpierr)
#endif
        SAFE_DEALLOCATE(xdum3Dyn)
        do ispin = 1, sig%nspin
          do in = 1, sig%ndiag + sig%noffdiag
            do ib = 1, sig%ntband
              achD_n1(ib,in,ispin) = achD_n1(ib,in,ispin) + achD_n1q(ib,in,ispin)
              achD_n1q(ib,in,ispin) = 0.0d0
            enddo
          enddo
        enddo
      endif
    enddo ! irq (loop over rq point in BZ summation)
    

!!-------- End loop over k-points in irr bz with respect to kn (rq) ------------


    if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
      do ispin=1,sig%nspin
        do in=1,sig%ndiag+sig%noffdiag
          if(.not. sig%use_xdat) ax(in,ispin) = axbis(in,ispin)
          asx(:,in,ispin) = asxbis(:,in,ispin)
          ach(:,in,ispin) = achbis(:,in,ispin)
          achcor(in,ispin) = achcorbis(in,ispin)
          asig_imag(in,ispin) = asig_imagbis(in,ispin)
        enddo
      enddo
    endif
    if (sig%freq_dep.eq.2) then
      do ispin=1,sig%nspin
        do in=1,sig%ndiag+sig%noffdiag
          if(.not. sig%use_xdat) ax(in,ispin) = axbis(in,ispin)
          asxDyn(:,in,ispin) = asxDbis(:,in,ispin)
          achDyn(:,in,ispin) = achDbis(:,in,ispin)
          ach2Dyn(:,in,ispin) = ach2Dbis(:,in,ispin)
          achcor(in,ispin) = achcorbis(in,ispin)
        enddo
      enddo
    endif
    
!----------------------------
! Output unsymmetrized values of X,SX,CH
! Symmetrize X,SX,CH matrix elements over degenerate states
! Convert to eV and output symmetrized values of X,SX,CH

    if (peinf%inode.eq.0) then

      ! This beautiful piece of code should never be executed and here only because it somehow
      ! prevents gfortran from an incorrect optimization of this routine that will produce NaNs.
      ! It has some mysterious relation to the ijk loop above reading from itpe.
      ! Insanely, the presence of function 'isNaN' appears to be crucial here. --DAS
#ifdef GNU
      if(ikn == -1) then
        call die("BUG: ikn = -1")
        write(0,*) isNaN(fact)
      endif
#endif

      do ispin=1,sig%nspin
        write(6,989)ikn,sig%spin_index(ispin)
        if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
          write(6,988)
          do i=1,sig%ndiag
            write(6,987)sig%diag(i),wfnk%elda(sig%diag(i),ispin), &
              wfnk%ek(sig%diag(i),ispin),dble(ax(i,ispin))*ryd, &
              dble(asx(2,i,ispin))*ryd,dble(ach(2,i,ispin)+achcor(i,ispin))*ryd, &
              dble(ax(i,ispin)+asx(2,i,ispin)+ach(2,i,ispin)+achcor(i,ispin))*ryd, &
              dble(alda(i,ispin))*ryd,dble(asig_imag(i,ispin)*ryd)
          enddo
        endif
          
        if (sig%freq_dep.eq.2) then
          write(6,888)
          do i=1,sig%ndiag
            
! JRD: Find iw closest to e_lk

            diffmin = INF
            e_lk = wfnk%ek(sig%diag(i),ispin)
            do iw=1,sig%nfreqeval
              diff = abs(sig%freqevalmin + (iw-1)*sig%freqevalstep - e_lk)
              if (diff .lt. diffmin) then
                diffmin=diff
                iwlda=iw
              endif
            enddo
            
            write(6,887)sig%diag(i),wfnk%elda(sig%diag(i),ispin), &
              wfnk%ek(sig%diag(i),ispin),dble(ax(i,ispin))*ryd, &
              dble(asxDyn(iwlda,i,ispin))*ryd,dble(achDyn(iwlda,i,ispin)+achcor(i,ispin))*ryd, &
              dble(ax(i,ispin)+asxDyn(iwlda,i,ispin)+achDyn(iwlda,i,ispin)+achcor(i,ispin))*ryd, &
              dble(alda(i,ispin))*ryd, &
              IMAG(asxDyn(iwlda,i,ispin))*ryd,IMAG(achDyn(iwlda,i,ispin))*ryd, &
              IMAG(ax(i,ispin)+asxDyn(iwlda,i,ispin)+achDyn(iwlda,i,ispin))*ryd, &
              IMAG(ax(i,ispin)+asxDyn(iwlda,i,ispin)+ach2Dyn(iwlda,i,ispin))*ryd
          enddo
        endif
      enddo
    endif ! node 0
989 format(1x,"Unsymmetrized values for ik =",i4,1x,"spin =",i2)
988 format(/,3x,"n",5x,"elda",5x,"ecor",8x,"x",5x,"sx-x",7x,"ch",6x,"sig",6x,"vxc",2x,"Sig Im. GPP")
987 format(i4,8f9.3)
888 format(/,3x,"n",4x,"elda",5x,"ecor",8x,"x",2x,"re sx-x",4x,"re ch",3x,"re sig",6x,"vxc",/,&
      33x,"im sx-x",4x,"im ch",3x,"im sig",2x,"im sig2")
887 format(i4,7f9.3,/,28x,4f9.3)
      
!----------------------------
! Symmetrize matrix elements

    if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
      call shiftenergy(sig,wfnk,alda,asx,ach,achcor,ax,efsto,asig,enew,zrenorm)
    endif
    if (sig%freq_dep.eq.2) then
      call shiftenergy_dyn(sig,wfnk,alda,asxDyn,achDyn,ach2Dyn,achcor,ax,efstoDyn,asigDyn,ikn,kp)
    endif
    
!----------------------------
! Write out matrix elements

    if (peinf%inode.eq.0) then
      write(6,'(a)')
      write(6,'(a)') ' Symmetrized values from band-averaging:'

      if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
        call write_result(kp,wfnk,sig,ach_n1,ax,asx,ach,achcor,asig,alda,efsto,enew,zrenorm,ikn) ! wtf
        call write_result_hp(kp,wfnk,sig,ax,asx,ach,achcor,asig,alda,efsto,enew,zrenorm,ikn)
        if (sig%freq_dep.eq.1) then
          do ispin=1,sig%nspin
            do i=1,sig%ndiag
              if (abs(efsto(i,ispin)+SCALARIFY(achcor(i,ispin)) - wfnk%ek(sig%diag(i),ispin)).gt.sig%dw) then
                eqp1flag=.true.
              endif
            enddo
          enddo
        endif
      endif
      
      if (sig%freq_dep.eq.2) then
        call write_result_dyn(kp,wfnk,sig,achD_n1,ax,asxDyn,achDyn,achcor,asigDyn,alda,efstoDyn,ikn)
        call write_result_dyn_hp(kp,wfnk,sig,ax,asxDyn,achDyn,achcor,asigDyn,alda,efstoDyn,ikn)
      endif
    endif

    if(peinf%inode == 0) then

!----------------------------
! If not using vxc.dat, create and write Vxc in it

      call timacc(5,1,tsec)
      if(.not. sig%use_vxcdat) then
        call write_matrix_elements_type(120, kp%rk(:, ikn), sig, COMPLEXIFY(alda(:,:)))
      endif
      call timacc(5,2,tsec)
      
!----------------------------
! If not using x.dat, create and write X in it

      call timacc(18,1,tsec)
      if((.not. sig%use_xdat) .and. xflag) then
        ax(:,:) = ax(:,:) / sig%xfrac
        call write_matrix_elements_type(119, kp%rk(:, ikn), sig, COMPLEXIFY(ax(:,:)))
      endif
      call timacc(18,2,tsec)
      
    endif
    
    SAFE_DEALLOCATE(indrq)
    SAFE_DEALLOCATE(neq)
    SAFE_DEALLOCATE(itnrq)
    SAFE_DEALLOCATE(rq)
    SAFE_DEALLOCATE(kg0)
    SAFE_DEALLOCATE_P(wfnk%zk)
    
  enddo ! ika (loop over k-points sig%nkn)

  call destroy_qran() ! from vcoul_generator

!--------- End loop over rkn points (for which we calculate sigma) -------------

  
  if (peinf%inode.eq.0) then
    if (sig%freq_dep.eq.2) then
      write(6,778)
      write(7,778)
      write(8,778)
    endif
    write(6,777)
    write(7,777)
    write(8,777)
    if (sig%noffdiag.gt.0) then
      write(6,776)
      write(7,776)
      write(8,776)
    endif
  endif
778 format(/,1x,"eqp0 eigenvalue at nearest energy to quasiparticle eigenvalue", &
      /,1x,"for eqp0(omega) look at spectrum.dat")
777 format( &
      /,4x,"n = band index", &
      /,1x,"elda = energy eigenvalue", &
      /,1x,"ecor = corrected energy eigenvalue", &
      /,4x,"x = bare exchange", &
      /,3x,"sx = screened exchange at energy ecor", &
      /,3x,"ch = coulomb hole at energy ecor", &
      /,2x,"sig = sx + ch = self-energy at energy ecor", &
      /,2x,"vxc = exchange-correlation potential", &
      /,1x,"eqp0 = elda - vxc + sig(ecor)", &
      /,1x,"eqp1 = eqp0 + (dsig/de) / (1 - dsig/de) * (eqp0 - ecor)", &
      /,2x,"Znk = quasiparticle renormalization factor",/, &
      /,8x,"finite_difference_form from sigma.inp file:", &
      /,8x,"none     = -2 : dsig/de = 0 [skip the expansion]", &
      /,8x,"backward = -1 : dsig/de = (sig(ecor) - sig(ecor-de)) / de", &
      /,8x,"central  =  0 : dsig/de = (sig(ecor+de) - sig(ecor-de)) / (2*de)", &
      /,8x,"forward  =  1 : dsig/de = (sig(ecor+de) - sig(ecor)) / de", &
      /,8x,"default  =  2 : forward for diagonal and none for off-diagonal", &
      /,8x,"de is finite_difference_spacing from sigma.inp file", &
      /,8x,"elda,ecor,x,sx,ch,sig,vxc,eqp0,eqp,de are in eV", &
      /,8x,"elda and vxc both contain vxc0 so it cancels out", &
      /,8x,"eqp1 and eqp0 are Eqs. (36-37) from Hybertsen & Louie PRB 34 5390",/)
776 format( &
      4x,"n = band index of bra wavefunction", &
      /,4x,"m = band index of ket wavefunction", &
      /,4x,"l = band index of energy eigenvalue",/, &
      /,1x,"< psi_n(k) |       x       | psi_m(k) >" &
      /,1x,"< psi_n(k) | sx(ecor_l(k)) | psi_m(k) >" &
      /,1x,"< psi_n(k) | ch(ecor_l(k)) | psi_m(k) >" &
      /,1x,"< psi_n(k) |      vxc      | psi_m(k) >",/)
  
  if (peinf%inode.eq.0) then
    if (eqp1flag) then
      write(0,666)
    endif
  endif
666 format( &
      1x,"WARNING: |eqp0 - ecor| > finite_difference_spacing",/, &
      3x,"Linear extrapolation for eqp1 may be inaccurate.",/, &
      3x,"You should test the validity of eqp1 by rerunning",/, &
      3x,"calculation with self-energy evaluated at the eqp0",/, &
      3x,"energies. For that, use the eqp_outer.dat file",/, &
      3x,"created with eqp.py script and point WFN_outer to WFN_inner",/, &
      3x,"if you were not already using WFN_outer.",/)
  
  if (peinf%inode.eq.0) then
    if (imagvxcflag) write(0,677) 'Vxc'
    if (imagxflag) write(0,677) 'exchange'
  endif
677 format( &
      1x,"WARNING: ",a," diagonal matrix elements have large imaginary part.",/, &
      3x,"This may indicate a problem with your calculation.",/)
  
    
!--------- Clean Up and Finish -------------------------------------------------

  call destroy_fftw_plans()

  if (sig%spl_tck%n>0) then
    SAFE_DEALLOCATE_P(sig%spl_tck%t)
    SAFE_DEALLOCATE_P(sig%spl_tck%c)
  endif
  if (sig%spl_tck_outer%n>0) then
    SAFE_DEALLOCATE_P(sig%spl_tck_outer%t)
    SAFE_DEALLOCATE_P(sig%spl_tck_outer%c)
  endif

  SAFE_DEALLOCATE_P(kp%rk)
  SAFE_DEALLOCATE_P(wfnk%isrtk)
  SAFE_DEALLOCATE_P(wfnk%ek)
  SAFE_DEALLOCATE_P(wfnk%elda)
  if (sig%nq.gt.0) then
    SAFE_DEALLOCATE_P(sig%qpt)
  end if
  if (sig%ndiag.ne.0) then
    SAFE_DEALLOCATE_P(sig%diag)
  end if
  if (sig%noffdiag.gt.0) then
    SAFE_DEALLOCATE_P(sig%off1)
    SAFE_DEALLOCATE_P(sig%off2)
    SAFE_DEALLOCATE_P(sig%off3)
    SAFE_DEALLOCATE_P(sig%offmap)
  endif
  SAFE_DEALLOCATE_P(sig%indkn)
  if(.not. sig%use_vxcdat) then
    SAFE_DEALLOCATE_P(sig%vxc)
    call close_file(120)
  end if
  SAFE_DEALLOCATE_P(gvec%k)
  SAFE_DEALLOCATE_P(gvec%indv)
  if(sig%freq_dep.eq.1) then
    SAFE_DEALLOCATE_P(wpg%rho)
  endif
  SAFE_DEALLOCATE_P(wfnkq%isrtkq)
  SAFE_DEALLOCATE_P(wfnkq%ekin)
  SAFE_DEALLOCATE_P(wfnkq%ekq)
  SAFE_DEALLOCATE_P(peinf%indext)
  SAFE_DEALLOCATE(alda)
  SAFE_DEALLOCATE(ax)
  SAFE_DEALLOCATE(axbis)
  SAFE_DEALLOCATE(achcor)
  SAFE_DEALLOCATE(achcorbis)
  SAFE_DEALLOCATE(asig_imag)
  if (sig%freq_dep.eq.-1.or.sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
    SAFE_DEALLOCATE(asx)
    SAFE_DEALLOCATE(ach)
    SAFE_DEALLOCATE(asig)
    SAFE_DEALLOCATE(acht_n1)
    SAFE_DEALLOCATE(ach_n1q)
    SAFE_DEALLOCATE(ach_n1)
    SAFE_DEALLOCATE(asxbis)
    SAFE_DEALLOCATE(achbis)
    SAFE_DEALLOCATE(enew)
    SAFE_DEALLOCATE(efsto)
  endif
  if (sig%freq_dep.eq.2) then
    SAFE_DEALLOCATE(asxDyn)
    SAFE_DEALLOCATE(achDyn)
    SAFE_DEALLOCATE(ach2Dyn)
    SAFE_DEALLOCATE(asigDyn)
    SAFE_DEALLOCATE(achtD_n1)
    SAFE_DEALLOCATE(achD_n1q)
    SAFE_DEALLOCATE(achD_n1)
    SAFE_DEALLOCATE(asxDbis)
    SAFE_DEALLOCATE(achDbis)
    SAFE_DEALLOCATE(ach2Dbis)
    SAFE_DEALLOCATE(efstoDyn)
    SAFE_DEALLOCATE(asxtDyn)
    SAFE_DEALLOCATE(achtDyn)
    SAFE_DEALLOCATE(ach2tDyn)
  endif
  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
    SAFE_DEALLOCATE(eps)
    if (sig%iwriteint.eq.0) then
      SAFE_DEALLOCATE(epstemp)
    end if
  endif
  if (sig%freq_dep.eq.2) then
    SAFE_DEALLOCATE(epsR)
    SAFE_DEALLOCATE(epsRtemp)
#ifdef CPLX
    SAFE_DEALLOCATE(epsA)
    SAFE_DEALLOCATE(epsAtemp)
#endif
  endif
  SAFE_DEALLOCATE(isrtrq)
  if (sig%freq_dep.eq.0.or.sig%exact_ch.eq.1) then
    SAFE_DEALLOCATE(isrtrqi)
  endif
  SAFE_DEALLOCATE_P(gvec%ekin)
  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2) then
    SAFE_DEALLOCATE(isrtq)
    SAFE_DEALLOCATE(isrtqi)
    SAFE_DEALLOCATE(ind)
    SAFE_DEALLOCATE(ph)
    if(sig%iwriteint.eq.1) then
      if (peinf%inode .eq. 0) then
        SAFE_DEALLOCATE_P(epsmpi%isrtq)
        SAFE_DEALLOCATE_P(epsmpi%isrtqi)
      endif
      SAFE_DEALLOCATE_P(epsmpi%qk)
      SAFE_DEALLOCATE_P(epsmpi%nmtx)
      SAFE_DEALLOCATE_P(epsmpi%igown)
      SAFE_DEALLOCATE_P(epsmpi%iggown)
      if(sig%freq_dep.eq.0.or.sig%freq_dep.eq.1) then
        SAFE_DEALLOCATE_P(epsmpi%eps)
      else
        SAFE_DEALLOCATE_P(epsmpi%epsR)
#ifdef CPLX
        SAFE_DEALLOCATE_P(epsmpi%epsA)
#endif
      endif
      SAFE_DEALLOCATE_P(wfnkqmpi%nkptotal)
      SAFE_DEALLOCATE_P(wfnkqmpi%isort)
      SAFE_DEALLOCATE_P(wfnkqmpi%el)
      SAFE_DEALLOCATE_P(wfnkqmpi%qk)
      SAFE_DEALLOCATE_P(wfnkqmpi%band_index)
      SAFE_DEALLOCATE_P(wfnkqmpi%cg)
      if (sig%nkn.gt.1) then
        SAFE_DEALLOCATE_P(wfnkmpi%nkptotal)
        SAFE_DEALLOCATE_P(wfnkmpi%isort)
        SAFE_DEALLOCATE_P(wfnkmpi%qk)
        SAFE_DEALLOCATE_P(wfnkmpi%el)
        SAFE_DEALLOCATE_P(wfnkmpi%elda)
        SAFE_DEALLOCATE_P(wfnkmpi%cg)
      endif
    endif
  endif
  SAFE_DEALLOCATE(qq)
  if (sig%freq_dep.eq.0.or.sig%freq_dep.eq.1.or.sig%freq_dep.eq.2) then
    SAFE_DEALLOCATE(igown)
    SAFE_DEALLOCATE(iggown)
  endif
    
!----------------------------
! Time Accounting
    
  routnam(1)='TOTAL:'
  routnam(2)='INPUT:'
  routnam(3)='EPSCOPY:'
  routnam(4)='FULLBZ:'
  routnam(5)='VXC:'
  routnam(6)='SUBGRP:'
  routnam(7)='IRRBZ:'
  routnam(8)='GMAP:'
  routnam(9)='GENWF:'
  routnam(10)='MTXEL:'
  routnam(11)='SX CH:'
!  routnam(12)='SX CH (WPEFF):'
  routnam(13)='VCOUL:'
  routnam(14)='EPSREAD:'
  routnam(15)='INPUT_OUTER:'
  routnam(16)='MTXEL_CH:'
  routnam(17)='MTXEL COMM:'
  routnam(18)='BARE X:'
  routnam(19)='WF COMM:'
  routnam(20)='WF_CH COMM:'
  routnam(21)='INPUT (READ):'
  routnam(22)='INPUT (WRITE):'
  
  routsrt=(/ 1, 4, 7, 6, 8, 2,21,22,15, 9, 19,20,3,14,10,16,17,13, 5,18,11/)
  
  if(peinf%inode.eq.0) then
    call timacc(1,2,tsec)
    write(6,9000)
    do i=2,ubound(routsrt, 1)
      call timacc(routsrt(i),3,tsec,ncount)
      write(6,9001) routnam(routsrt(i)),tsec(1),tsec(2),ncount
    enddo
    write(6,*)
    call timacc(routsrt(1),3,tsec,ncount)
    write(6,9002) routnam(routsrt(1)),tsec(1),tsec(2)
    write(6,*)
9000 format(/,22x,"CPU [s]",8x,"WALL [s]",11x,"#",/)
9001 format(a16,f13.3,3x,f13.3,3x,i9)
9002 format(a16,f13.3,3x,f13.3)
  endif
  
!----------------------------
! Close files and finish

  call close_file(55) ! file sigma.inp
! call close_file(6) ! file sig.out
  if(peinf%inode == 0) then
    call close_file(7) ! file sigma.log
    call close_file(8) ! file sigma_hp.log
  endif
  if ((.not. sig%use_xdat) .and. xflag) then
    call close_file(119) ! file x.dat
  endif
  if (peinf%inode .eq. 0 .and. .not.(sig%freq_dep .eq. 0 .and. sig%exact_ch .eq. 1)  .and. .not. (sig%freq_dep == -1)) then
    call close_file(127) ! file ch_converge.dat
  endif
  if (sig%iwritecoul .eq. 1) then
    if (peinf%inode .eq. 0) then
      call close_file(19) ! file vcoul
    endif
  endif  
  if (peinf%inode .eq. 0 .and. sig%freq_dep .eq. 2) then
    write(8000, '(/,a)')'# Please refer to Sigma/README for more information about this file.'
    call close_file(8000) ! file spectrum.dat
!   call close_file(8001) ! file spectrum.plot
  endif
  if (sig%iwriteint .eq. 0) then
    if (peinf%inode.eq.0 .and. sig%freq_dep >= 0) then
      call open_file(itpe, file=fne, status='old')
      call close_file(itpe, delete = .true.)
    endif
    call open_file(itpc, file=fnc, status='old')
    call close_file(itpc, delete = .true.) ! files INT_CWFN_*
    if (mod(peinf%inode,peinf%npes/peinf%npools).eq.0) then
      call open_file(itpk, file=fnk, status='old')
      call close_file(itpk, delete = .true.) ! files INT_WFNK_*
    endif
  endif
  
  call write_memory_usage()

#ifdef MPI
  call MPI_Finalize(mpierr)
#endif

end program sigma
