!==============================================================================
!
! Routines:
!
! (1) epsilon           Originally by (MSH)      Last Edited 5/1/2008 (JRD)
!
!     Send comments/bugs to jdeslip@berkeley.edu
!
!     See README file for more details
!
!==============================================================================

#include "f_defs.h"

program epsilon

  use global_m
  use blas_m
  use extrapolar_m
  use fftw_m
  use fullbz_m
  use gmap_m
  use irrbz_m
  use mtxelmultiply_m
  use read_matrix_m
  use scalapack_m
  use vcoul_generator_m
  use write_matrix_m
  use sort_m

  implicit none

  type (kpoints) :: kp
  type (kpoints) :: kpq
  type (symmetry) :: syms
  type (gspace) :: gvec
  type (crystal) :: crys
  type (polarizability) :: pol
  type (valence_wfns) :: vwfn
  type (conduction_wfns) :: cwfn
  type (scalapack) :: scal
  type (int_wavefunction) :: intwfnv
  type (int_wavefunction) :: intwfnvq
  type (int_wavefunction) :: intwfnc

!-----------------------
! Arrays for kpoints (fullbz, ...etc)

  integer :: nind,nrq,nct,jj,idis
  integer :: indst(48)
  integer, allocatable :: indrq(:),neq(:)
  type(grid) :: gr

!-----------------------
! Arrays for polarizability matrix

  integer :: nstar,indexq0,valueq0,iflagq0,iflagq02
  integer :: npcmax,nprmax,ivin,neqmax
  integer, allocatable :: ind(:),indt(:,:,:)
  integer, allocatable :: indExtra(:), indtExtra(:,:,:) ! extrapolar index a pure G needed?
  integer, allocatable :: deltaCount(:,:)
  integer, allocatable :: deltaCountReduce(:,:)
  integer, allocatable :: nprdtemp(:),npcdtemp(:)
  integer, allocatable :: imyrowdtemp(:,:),imycoldtemp(:,:)
  real(DP) :: q0norm,q0(3),ecuts,omega_plasma
  real(DP) :: ecutsExtra !extraPolar Cutoff Energy
  real(DP), allocatable :: ekinxExtra(:)
  SCALAR :: convminval,convmaxval
  SCALAR :: gvalmin
  SCALAR :: gvalmax

  SCALAR :: phtmpmin,phtmpmax
  SCALAR, allocatable :: ph(:), convmin(:), convmax(:)
  SCALAR, allocatable :: convmin2(:), convmax2(:)
  SCALAR, allocatable :: pht(:,:,:)
  SCALAR, allocatable :: phExtra(:), phtExtra(:,:,:) ! Extrapolar phase
  integer, allocatable :: nst(:)
  SCALAR, allocatable :: chilocal(:,:)
  SCALAR, allocatable :: chilocal2(:,:,:)
  SCALAR, allocatable :: chilocalExtra(:,:)

  complex(DPC), allocatable :: chiRDyntmp(:)
  complex(DPC), allocatable :: tempvalRDyn(:)
  complex(DPC), allocatable :: chilocalRDyn(:,:,:)
  complex(DPC), allocatable :: chilocal2RDyn(:,:,:,:)
  SCALAR, allocatable :: gmetempr(:,:),gmetempc(:,:)
  SCALAR, allocatable :: gmetempn(:)
  SCALAR, allocatable :: gmetemprExtra(:,:),gmetempcExtra(:,:)
  SCALAR, allocatable :: gmetempnExtra(:)
  complex(DPC), allocatable :: gmeRDyntempn(:)
  complex(DPC), allocatable :: gmeRDyntempr(:,:,:)
  complex(DPC), allocatable :: gmeRDyntempr2(:)
  complex(DPC), allocatable :: gmeRDyntempc(:,:)
  complex(DPC), allocatable :: edenDRtemp(:)
#ifdef CPLX
  complex(DPC), allocatable :: chiADyntmp(:)
  complex(DPC), allocatable :: tempvalADyn(:)
  complex(DPC), allocatable :: chilocalADyn(:,:,:)
  complex(DPC), allocatable :: chilocal2ADyn(:,:,:,:)
  complex(DPC), allocatable :: gmeADyntempn(:)
  complex(DPC), allocatable :: gmeADyntempr(:,:,:)
  complex(DPC), allocatable :: gmeADyntempr2(:)
  complex(DPC), allocatable :: gmeADyntempc(:,:)
  complex(DPC), allocatable :: edenDAtemp(:)
#endif

  SCALAR, allocatable :: convmaxtotal(:), convmintotal(:)
  character :: aheadinput*60,ajname*6,tmpstr*100,adate*11,atime*14
  character :: tmpfn*16,filename*13
  integer :: initial_access = 0
  integer :: i,j,k,n,irq,iv,ig,itran,it
  integer :: ncount,ix,jx,kgq(3)
  integer :: np,itape,iq,icurr,ntot,itot,deltaCountI
  integer :: ik, ij, iown, isend
  integer :: kgg(3) ! extrapolar variables
  integer :: ipe, ntot2, ijkfreq, nmtx_t
  integer, allocatable :: iowna(:)
  real(DP) :: tsec(2)
  real(DP) :: fact,qk(3),rq(3)
  integer :: ispin,icol,irow,icolm,irowm, nqnonzero
  character*24 :: routnam(33),strhead,strtail,countFile
  integer :: routsrt(32)
  integer :: ivr, ilimit, itpv, itpc
  integer, allocatable :: icolv(:), icolmv(:)
  integer, allocatable :: irowv(:), irowmv(:)
  integer, allocatable :: icolmvg(:), irowmvg(:)
  real(DP) :: zvalue

  logical :: skip_checkbz

!--------------- Begin Program ---------------------------------------

  call peinfo_init()

!----------------------
! Initialize random numbers

  peinf%jobtypeeval = 0

!--------------------
! Initialize timer

  if(peinf%inode .eq. 0) then
    call timacc(1,0,tsec)
    call timacc(1,1,tsec)
  endif

!------------------------
! Initialize files

  if(peinf%inode .eq. 0) call timacc(19,1,tsec)
  
  call open_file(55,file='epsilon.inp',form='formatted',status='old')
  if(peinf%inode .eq. 0) then
    call open_file(7,file='epsilon.log',form='formatted',status='replace')
  endif
  
  call write_program_header('Epsilon', .true.)

!----------- Call Input: Read crystal data from unit 25 ---------------

! read parameters and q-points from unit 55 (input file)
! initialize unit 10 (output for polarizability matrix)

  write(aheadinput,'(60x)')
  write(ajname,'("chiGG0")')
  call date_time(adate,atime)

  if(peinf%inode .eq. 0) call timacc(19,2,tsec)
  
  if(peinf%inode .eq. 0) call timacc(2,1,tsec)
  call input(aheadinput,ajname,adate,kp,crys,syms,gvec, &
    ecuts,ecutsExtra,pol,indexq0,valueq0,q0norm,cwfn,vwfn, &
    intwfnv,intwfnc,omega_plasma)

  if(.not. pol%skip_chi .and. peinf%inode == 0) then
    call open_file(17,file='chi_converge.dat',form='formatted',status='replace')
  endif

  if (pol%iwritecoul .eq. 1) then
    if (peinf%inode .eq. 0) then
      call open_file(19,file='vcoul',form='formatted',status='replace')
    endif
  endif

! PWD: don`t like this but unclear where to do this
! DAS: I know where: put it in inread.
  if(pol%extraPolarMethod.eq.1.AND.ecutsExtra.eq.0.0d0) ecutsExtra = ecuts * 4

! CHP: saves the (G=0,G`=0) component of (retarded) epsilon inverse
  if(peinf%inode .eq. 0 .and. pol%freq_dep .eq. 2) then
    call open_file(51,file='EpsInvDyn',form='formatted',status='replace')
    call open_file(52,file='EpsDyn',form='formatted',status='replace')
  endif

  SAFE_ALLOCATE(vwfn%isort, (gvec%ng))
  SAFE_ALLOCATE(cwfn%isort, (gvec%ng))
  SAFE_ALLOCATE(cwfn%ekin, (gvec%ng))

  if (indexq0 .ne. 0) then
    if (pol%icutv .eq. 0 .and. q0norm .lt. TOL_small) then
      call die('no truncation and zero q0')
    endif
  endif
  
  if(peinf%inode .eq. 0) call timacc(2,2,tsec)


!-------------- Read wavefunctions for (k+q) points ---------------------

! SIB:  The input_q routine looks the same as the input routine but
! if reads from a file called WFNq instead of WFN.  Presumably
! these are the shifted (by "q") wave functions.

  if(peinf%inode .eq. 0) call timacc(3,1,tsec)
  if(indexq0 .gt. 0 .and. valueq0 .eq. 1 .and. pol%iqexactlyzero .eq. 0) then
    if (peinf%inode .eq. 0) then
      write(6,*) 'You have a slightly shifted q0 vector and a semiconductor.'
      write(6,*) 'So reading from WFNq.'
    endif
    call input_q(gvec,kpq,cwfn,vwfn,pol,intwfnvq)
  endif
  if(peinf%inode .eq. 0) call timacc(3,2,tsec)


!-------------- GENERATE FULL BZ ----------------------------------------

! SIB:  fullbz() takes the kpoints in kp%k(1:3,kp%nrk) and applies all
! the symmetries in syms to them.  The resulting set of unique vectors
! are in gr%f(1:3,gr%nf) (gr%nf of them).

  if(peinf%inode .eq. 0) call timacc(4,1,tsec)

#ifdef VERBOSE
  if (peinf%inode .eq. 0) then
    write(6,*)
    write(6,*) 'Calling fullbz to generate full zone from'
    write(6,*) 'syms%ntran = ',syms%ntran
    write(6,*) 'kp%nrk = ',kp%nrk
    do iq=1,kp%nrk
      write(6,'(3f10.4)') kp%rk(:,iq)
    enddo
    write(6,*)
  endif
#endif
  gr%nr = kp%nrk
  SAFE_ALLOCATE(gr%r, (3, gr%nr))
  gr%r = kp%rk
  call fullbz(crys,syms,gr,syms%ntran,skip_checkbz,wigner_seitz=.false.,paranoid=.true.)
  tmpfn='WFN'
  if (.not. skip_checkbz) then
    call checkbz(gr%nf,gr%f,kp%kgrid,kp%shift,crys%bdot, &
      tmpfn,'k',.false.,pol%freplacebz,pol%fwritebz)
  endif
  if(peinf%inode .eq. 0) then
    write(6,*) 'nfk=',gr%nf
#ifdef VERBOSE
    do iq=1,gr%nf
      write(6,'(3f10.4)') gr%f(:,iq)
    enddo
#endif
  endif

  SAFE_ALLOCATE(gvec%ekin, (gvec%ng))
  
  if(pol%extraPolarMethod.eq.1) then
    SAFE_ALLOCATE(ekinxExtra, (gvec%ng))
  endif
  
  SAFE_ALLOCATE(scal%nprd, (peinf%npes))
  SAFE_ALLOCATE(scal%npcd, (peinf%npes))
  
  if(peinf%inode .eq. 0) call timacc(4,2,tsec)


!----------- LOOP over q points for which chi and eps are calculated -----

  do iq=1,pol%nq
    
    if(peinf%inode.eq.0) call timacc(20,1,tsec)
    
! SIB:  q0(1:3) is the current q vector under consideration

    q0(:)=pol%qpt(:,iq)
    if(peinf%inode.eq.0) then
      write(6,'(/,/,"---------------------------------")')
    endif
    if(peinf%inode.eq.0)  write(6,9345) (q0(i),i=1,3)
9345 format(' Dealing now with  ',3f10.5,/)

! SIB:  flagq0 is 1 if the q vector is the "zero" q-vector.

    iflagq0=0
    if(indexq0.eq.iq.and.valueq0.eq.1) iflagq0=1
    iflagq02=0
    if(indexq0.eq.iq) iflagq02=1

!--------------------
! Determine number of matrix elements
!
! SIB:  for each g-vector, qk=g+q0 and gvec%ekin=qk`*bdot*qk
! (square length of qk)

    do i=1,gvec%ng
      if (iflagq02.eq.1) then
        qk(:)=gvec%k(:,i)
      else
        qk(:)=gvec%k(:,i)+q0(:)
      endif
      gvec%ekin(i)=DOT_PRODUCT(qk,MATMUL(crys%bdot,qk))
    enddo

! PWD:  So we probably need to do the same for the extrapolar term
! which is just G based 
    if(pol%extraPolarMethod.eq.1) then
      do i=1,gvec%ng
        qk(:)=gvec%k(:,i)
        ekinxExtra(i)=DOT_PRODUCT(qk,MATMUL(crys%bdot,qk))
      enddo
    endif

!--------------------
! Sort kinetic energies
! index of ordered kinetic energies in array isrtx
!
! SIB: pol%isrtx has the indices for sorted gvec%ekin

    SAFE_ALLOCATE(pol%isrtx, (gvec%ng))

    if(peinf%inode.eq.0) call timacc(20,2,tsec)
    
    if(peinf%inode.eq.0) call timacc(5,1,tsec)
#ifdef VERBOSE
    call logit('sorting gvec')
#endif
    call sortrx_D(gvec%ng, gvec%ekin, pol%isrtx, gvec = gvec%k)
    if(peinf%inode.eq.0) call timacc(5,2,tsec)

! PWD: pol%isrtx has the indices sorted for the straight G gvec%ekin
    if(pol%extraPolarMethod.eq.1) then
      SAFE_ALLOCATE(pol%isrtxExtra,(gvec%ng))
      call logit('sorting gvec Extra')
      call sortrx_D(gvec%ng, ekinxExtra, pol%isrtxExtra, gvec = gvec%k)
    endif

!---------------------
! Compute inverse array to isrtx

    SAFE_ALLOCATE(pol%isrtxi, (gvec%ng))
    do i=1,gvec%ng
      pol%isrtxi(pol%isrtx(i))=i
    enddo

    if(pol%extraPolarMethod.eq.1) then
      SAFE_ALLOCATE(pol%isrtxiExtra, (gvec%ng))
      do i=1,gvec%ng
        pol%isrtxiExtra(pol%isrtxExtra(i))=i
      enddo
    endif

! SIB:  pol%nmtx becomes the number of matrix elements to be computed;
! the matrix is computed if its gvec%ekin is < ecuts

    if(peinf%inode.eq.0) call timacc(21,1,tsec)

    pol%nmtx = gcutoff(gvec, pol%isrtx, ecuts)
    if(pol%extraPolarMethod == 1) &
      pol%nGGpnmtx = gcutoff(gvec, pol%isrtxExtra, ecutsExtra)

    if(peinf%inode.eq.0.and.pol%extraPolarMethod.eq.1) then
      write(6,*) 'pol%nGGpnmtx=', pol%nGGpnmtx
    endif
        
! PWD: extrapolar G-G` number of elements

    if(peinf%inode.eq.0) call timacc(21,2,tsec)
    if(peinf%inode.eq.0) call timacc(22,1,tsec)

    SAFE_ALLOCATE(pol%irow, (pol%nmtx))
    pol%irow=0

! JRD:  Determine size of distributed matrices

    SAFE_ALLOCATE(irowv, (pol%nmtx))
    SAFE_ALLOCATE(icolv, (pol%nmtx))

    call blacs_setup(scal, pol%nmtx, .true.)

#ifdef USESCALAPACK
#ifdef VERBOSE
    call logit('Initializing scaLAPACK')
    if (peinf%inode .eq. 0) then
      write(6,*) ' '
    endif
#endif
    do ig=1,pol%nmtx
      irowv(ig)=MOD(INT(((ig-1)/scal%nbl)+TOL_SMALL),scal%nprow)
      icolv(ig)=MOD(INT(((ig-1)/scal%nbl)+TOL_SMALL),scal%npcol)
    enddo
#else
    do ig=1,pol%nmtx
      irowv(ig)=0
      icolv(ig)=0
    enddo
#endif

#ifdef MPI
    SAFE_ALLOCATE(nprdtemp, (peinf%npes))
    SAFE_ALLOCATE(npcdtemp, (peinf%npes))
    
    scal%nprd = 0
    scal%npcd = 0
    nprdtemp = 0
    npcdtemp = 0
    nprdtemp(peinf%inode + 1) = scal%npr
    npcdtemp(peinf%inode + 1) = scal%npc

    call MPI_ALLREDUCE(nprdtemp,scal%nprd,peinf%npes,MPI_INTEGER,MPI_SUM, MPI_COMM_WORLD,mpierr)
    call MPI_ALLREDUCE(npcdtemp,scal%npcd,peinf%npes,MPI_INTEGER,MPI_SUM, MPI_COMM_WORLD,mpierr)

    SAFE_DEALLOCATE(nprdtemp)
    SAFE_DEALLOCATE(npcdtemp)
#else
    scal%nprd = scal%npr
    scal%npcd = scal%npc
#endif

    SAFE_ALLOCATE(irowmv, (scal%npr*scal%npc))
    SAFE_ALLOCATE(icolmv, (scal%npr*scal%npc))
    SAFE_ALLOCATE(irowmvg, (scal%npr*scal%npc))
    SAFE_ALLOCATE(icolmvg, (scal%npr*scal%npc))
    
    icurr=0
    do ig = 1, scal%npr*scal%npc
      icurr=icurr+1
      irowmv(icurr)=INT((icurr-1)/scal%npc+TOL_SMALL)+1
      icolmv(icurr)=MOD((icurr-1),scal%npc)+1
    enddo
    
!------------------------------
! JRD: Create isrtxcol isrtxrow

    SAFE_ALLOCATE(scal%isrtxcol, (scal%npc))
    SAFE_ALLOCATE(scal%isrtxrow, (scal%npr))
    SAFE_ALLOCATE(scal%imycol, (scal%npc))
    SAFE_ALLOCATE(scal%imyrow, (scal%npr))

#ifdef MPI
    call MPI_ALLREDUCE(scal%npc,npcmax,1,MPI_INTEGER,MPI_MAX,MPI_COMM_WORLD,mpierr)
    call MPI_ALLREDUCE(scal%npr,nprmax,1,MPI_INTEGER,MPI_MAX,MPI_COMM_WORLD,mpierr)
#else
    npcmax = scal%npc
    nprmax = scal%npr
#endif

!        if (peinf%inode .eq. 0) write(6,*) "npcmax,nprmax",npcmax,nprmax,scal%npc,scal%npr
!        write(6,*) 'Allocating imyrowd', nprmax,npcmax,peinf%npes

    SAFE_ALLOCATE(scal%imycold, (npcmax,peinf%npes))
    SAFE_ALLOCATE(scal%imyrowd, (nprmax,peinf%npes))
    SAFE_ALLOCATE(imycoldtemp, (npcmax,peinf%npes))
    SAFE_ALLOCATE(imyrowdtemp, (nprmax,peinf%npes))
    scal%imycold = 0
    scal%imyrowd = 0
    imycoldtemp = 0
    imyrowdtemp = 0

    icurr=0
    
    do ij = 1, pol%nmtx
      irow=MOD(INT(((ij-1)/scal%nbl)+TOL_SMALL),scal%nprow)
      if (irow .eq. scal%myprow) then
        do ik = 1, pol%nmtx
          icol=MOD(INT(((ik-1)/scal%nbl)+TOL_SMALL),scal%npcol)
          if(icol .eq. scal%mypcol) then
            icurr=icurr+1
            irowm=INT((icurr-1)/scal%npc+TOL_SMALL)+1
            icolm=MOD((icurr-1),scal%npc)+1
            icolmvg(icurr)=ik
            irowmvg(icurr)=ij
            scal%isrtxrow(irowm)=pol%isrtx(ij)
            scal%isrtxcol(icolm)=pol%isrtx(ik)
            scal%imyrow(irowm)=ij
            scal%imycol(icolm)=ik
            imyrowdtemp(irowm,peinf%inode+1)=ij
            imycoldtemp(icolm,peinf%inode+1)=ik
          endif
        enddo
      endif
    enddo

#ifdef MPI
    call MPI_ALLREDUCE(imyrowdtemp(1,1),scal%imyrowd(1,1),nprmax*peinf%npes,MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,mpierr)
    call MPI_ALLREDUCE(imycoldtemp(1,1),scal%imycold(1,1),npcmax*peinf%npes,MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,mpierr)
#else
    scal%imyrowd = imyrowdtemp
    scal%imycold = imycoldtemp
#endif

    SAFE_DEALLOCATE(imycoldtemp)
    SAFE_DEALLOCATE(imyrowdtemp)

    if(peinf%inode.eq.0) call timacc(22,2,tsec)

    if(peinf%inode.eq.0) call timacc(23,1,tsec)

!----------------------
! Determine subgroup which leaves q0 invariant
!
! SIB:  figures out which symmetries acting on q0 result in q0 + integer
! entries.  syms%ntranq is their number, syms%indsub are their indices
! (pointing to syms%mtrx), and syms%kgzero(1:3,:) are the integer entries.

    if(peinf%inode.eq.0) call timacc(6,1,tsec)
    call subgrp(q0,syms)
    if(peinf%inode.eq.0) call timacc(6,2,tsec)

!-----------------------
! Determine independent elements of polarizability matrix
!
! SIB:  independent means due to symmetries.  This initializes
! the polarizability matrix pol%chi to zero (for independent entries)
! and figure out phases due to symmetries for dependent ones,
! and points dependent ones to the entries they depend on (pol%kxi indices)

    nind = 0

! JRD: In parallel version we don`t do this anymore

!        if(peinf%inode.eq.0) call timacc(7,1,tsec)
!#ifdef VERBOSE
!        call logit('calling indep')
!#endif
!        call indep(nind,gvec,syms,pol,kp%nspin)
!
! JRD: Testing what if we set pol%kxi to zero
!        pol%kxi = 0
!        pol%chi = 0D0
!        nind=pol%nmtx*(pol%nmtx+1)/2
!
!        if(peinf%inode.eq.0) call timacc(7,2,tsec)

!----------------------
! Reduce the k-points to the irr. bz with respect to q
!
! SIB:  figure out k-points in irr. BZ (based on symmetries for current q)
! nrq is # of irr. points, indrq are their indices in the full zone,
! and neq is the number of equiv. points for an irr. point.
! (Full zone vectors come in through gr%f(1:3,1:gr%nf).)

    if(peinf%inode.eq.0) call timacc(8,1,tsec)
    SAFE_ALLOCATE(indrq, (gr%nf))
    SAFE_ALLOCATE(neq, (gr%nf))
    call irrbz(syms,gr%nf,gr%f,nrq,neq,indrq)
    if(peinf%inode.eq.0) call timacc(8,2,tsec)

    neqmax = maxval(neq(1:nrq))

!        write(6,*) peinf%inode, 'neqmax', neq(1), neqmax

!---------------------------
! Output points in irr. bz

    if(peinf%inode.eq.0) then
      write(6,80) (q0(i),i=1,3),pol%nmtx,nind,nrq
80    format(3x,'q =',3f8.4,3x,'nmtx =',i8,5x,'nind =',i8, &
        /,/,5x,i4,1x,'k-points in irreducible bz(q)', &
        /,/,15x,'rq',16x,'neq')
      do j=1,nrq
        write(6,60) (gr%f(i,indrq(j)),i=1,3),neq(j)
60      format(3f10.5,i5)
      enddo
      write(7,70) (q0(i),i=1,3),pol%nmtx,nind,nrq
70    format(/ /,5x,'q=',3f7.4,2x,'nmtx=',i8,2x,'nind=',i8,2x,'nrq=',i3)
    endif

    fact=2.0d0/(dble(gr%nf)*crys%celvol)
    if(kp%nspin.eq.1) then
      fact=4.0d0/(dble(gr%nf)*crys%celvol) ! 4 spin components
    endif

    if (pol%freq_dep .eq. 0) then
      SAFE_ALLOCATE(pol%chi, (scal%npr,scal%npc,kp%nspin))
      pol%chi=ZERO
    endif

    if (pol%freq_dep .eq. 2) then
      SAFE_ALLOCATE(pol%chiRDyn, (pol%nFreq,scal%npr,scal%npc,kp%nspin))
      pol%chiRDyn=(0.0,0.0)
#ifdef CPLX
      SAFE_ALLOCATE(pol%chiADyn, (pol%nFreq,scal%npr,scal%npc,kp%nspin))
      pol%chiADyn=(0.0,0.0)
#endif
    endif

    if (.not. pol%skip_chi) then

!------------------------------------
! SIB:  allocate space

!        write(6,*) peinf%inode, 'allocating pht', neqmax, pol%nmtx, nrq

! JRD: Arrays for convergence tests

      if (pol%freq_dep .eq. 0) then
        SAFE_ALLOCATE(convmin, (cwfn%nband - vwfn%nband))
        SAFE_ALLOCATE(convmax, (cwfn%nband - vwfn%nband))
        SAFE_ALLOCATE(convmin2, (cwfn%nband - vwfn%nband))
        SAFE_ALLOCATE(convmax2, (cwfn%nband - vwfn%nband))
      endif
    
      if (peinf%inode .eq. 0 .and. pol%freq_dep .eq. 0) then
        write(6,*) 'Allocating Convergence Arrays', cwfn%nband-vwfn%nband
        convmin=0d0
        convmax=0d0
      endif

      SAFE_ALLOCATE(ind, (pol%nmtx))
      SAFE_ALLOCATE(ph, (pol%nmtx))
      SAFE_ALLOCATE(pht, (pol%nmtx,neqmax,nrq))
      SAFE_ALLOCATE(indt, (pol%nmtx,neqmax,nrq))
      if(pol%extraPolarMethod.eq.1) then
        SAFE_ALLOCATE(deltaCount, (pol%nmtx,pol%nmtx))
        SAFE_ALLOCATE(deltaCountReduce, (pol%nmtx,pol%nmtx))
        deltaCount = 0
        deltaCountReduce = 0
        SAFE_ALLOCATE(indExtra, (pol%nGGpnmtx))
        SAFE_ALLOCATE(phExtra, (pol%nGGpnmtx))
        SAFE_ALLOCATE(phtExtra, (pol%nGGpnmtx,neqmax,nrq))
        SAFE_ALLOCATE(indtExtra, (pol%nGGpnmtx,neqmax,nrq))
      endif
      ind=0

      SAFE_ALLOCATE(nst, (nrq))

! JRD: Possible Memory Hazard.  We can speed this up by possibly
! only allocating number of bands on current proc and doing send/recvs

      SAFE_ALLOCATE(pol%gme, (pol%nmtx,peinf%ncownt,peinf%nvownt,kp%nspin,nrq))
      if(pol%extraPolarMethod.eq.1) then
        SAFE_ALLOCATE(pol%gmeExtra, (pol%nmtx,peinf%ncownt,peinf%nvownt,kp%nspin,nrq))
      endif
      SAFE_ALLOCATE(iowna, (vwfn%nband+pol%ncrit))
    
! PWD: the extra extrapolar matrix elements
      if(pol%extraPolarMethod.eq.1) then
        SAFE_ALLOCATE(pol%gpgExtraPolar, (vwfn%nband+pol%ncrit,pol%nGGpnmtx,kp%nspin,nrq))
        SAFE_ALLOCATE(gmetempnExtra, (pol%nmtx))
      endif

      if (pol%freq_dep .eq. 0) then
        SAFE_ALLOCATE(gmetempn, (pol%nmtx))
      endif

      if (pol%freq_dep .eq. 2) then
        SAFE_ALLOCATE(pol%edenDyn, (peinf%nvownt,peinf%ncownt,kp%nspin,nrq))
        SAFE_ALLOCATE(edenDRtemp, (pol%nFreq))
        SAFE_ALLOCATE(gmeRDyntempn, (pol%nmtx))
        SAFE_ALLOCATE(chiRDyntmp, (pol%nFreq))
        SAFE_ALLOCATE(tempvalRDyn, (pol%nFreq))
#ifdef CPLX
        SAFE_ALLOCATE(edenDAtemp, (pol%nFreq))
        SAFE_ALLOCATE(gmeADyntempn, (pol%nmtx))
        SAFE_ALLOCATE(chiADyntmp, (pol%nFreq))
        SAFE_ALLOCATE(tempvalADyn, (pol%nFreq))
#endif
      endif

      if(peinf%inode.eq.0) call timacc(23,2,tsec)

!--------- LOOP OVER K-POINTS IN SET RQ ---------------------------------

! SIB:  loop over points in irreducible zone (why it`s labeled with q
! instead of k is a mystery)

      do irq=1,nrq
        
        if (pol%freq_dep .eq. 0) then
          SAFE_ALLOCATE(pol%eden, (vwfn%nband+pol%ncrit,cwfn%nband,kp%nspin))
          
          if (pol%extraPolarMethod.eq.1) then
            SAFE_ALLOCATE(pol%edenExtraPolar,(vwfn%nband+pol%ncrit,kp%nspin))
          endif
        endif
        
        rq(:)=gr%f(:,indrq(irq)) ! rq(:) is the current irr. k-point

! Regenerate star of rq,store the index of the rotation
! SIB:  Star is the set of vectors generated by applying all
! subgroup operations for the current q-vector to the k-point rq.

        if(peinf%inode.eq.0) call timacc(11,1,tsec)
        call rqstar(syms,nstar,indst,rq)
        if(nstar.ne.neq(irq)) then
          write(0,*) 'nstar?',irq,nstar,neq(irq)
          call die('nstar mismatch')
        endif
        if(peinf%inode.eq.0) call timacc(11,2,tsec)

! JRD: loop over transfs which generate the star of rq for gmap

        nst(irq) = nstar
        if(peinf%inode.eq.0) call timacc(12,1,tsec)
        do it=1,nstar

! Map g-vectors in polarizability to r**(-1)(g-gq)
! note that gmap requires index of transf in full group
! whereas indst gives index in subgroup

          itran = syms%indsub(indst(it))
          kgq(:) = -syms%kgzero(:,indst(it)) ! note minus sign
          call gmap(gvec,syms,pol%nmtx,itran,kgq,pol%isrtx,pol%isrtxi,ind,ph,.true.)

          pht(:,it,irq) = ph(:)
          indt(:,it,irq) = ind(:)

          if(pol%extraPolarMethod.eq.1) then
! PWD: seems that this should be different for the Diagonal
! G`-G diagonal extrapolar element but how?
! should kgq be zero?
            kgg(:) = 0
            call gmap(gvec,syms,pol%nGGpnmtx,itran, kgg,pol%isrtxExtra,pol%isrtxiExtra,indExtra,phExtra,.true.)
            phtExtra(:,it,irq) = phExtra(:)
            indtExtra(:,it,irq) = indExtra(:)
          endif

! debug Statement here
        enddo
        if(peinf%inode.eq.0) call timacc(12,2,tsec)


!--------- loop over occupied states -------------------------------------

! SIB:  loop over valence states (iv,ispin) where iv is the band index.

        pol%gme(:,:,:,:,irq)=0D0
        do iv=1,peinf%nvown

          if (peinf%inode .eq. 0) write(6,*) 'Doing iv', iv,'of', peinf%nvown
#ifdef VERBOSE
          call logit('calling genwf')
#endif
          if(peinf%inode.eq.0) call timacc(9,1,tsec)
          if (pol%iwriteint .eq. 0) then
            if (iv .le. peinf%nvownt) then
              ivr=peinf%invindexv(iv)
            else
              ivr=-1
            endif
            call genwf_disk(syms,gvec,crys,kp,kpq,irq,rq,q0,vwfn,pol,cwfn,iflagq0,ivr)
          else
            if (iv .le. peinf%nvownt) then
              call genwf_mpi(syms,gvec,crys,kp,kpq,irq,rq,q0,vwfn,pol,cwfn,iflagq0,intwfnv,intwfnvq,intwfnc,iv)
            endif
          endif
          if(peinf%inode.eq.0) call timacc(9,2,tsec)

! SIB:  compute matrix elements and energy denominators for (iv,ispin)
! with all other conduction bands.

          do ispin=1,kp%nspin

#ifdef VERBOSE
            call logit('done genwf')
            write(tmpstr,'(a,i2,a,i4,a)') "is =", ispin, " iv = ", iv, " calling mtxel" 
            call logit(tmpstr)
#endif

            if ( iv .le. peinf%nvownt) then
              if(peinf%inode.eq.0) call timacc(10,1,tsec)
              ivin=peinf%invindexv(iv)
!                  write(6,*) peinf%inode,'Call MTX',iv,ivin
!                  if(peinf%inode.eq.0) then
!                    write(6,*) ',cwfn%ec(cwfn%nband), cwfn%ecs', cwfn%ec(cwfn%nband,ispin), cwfn%ecs
!                  endif
                  
              call mtxel(ivin,gvec,vwfn,cwfn,pol,ispin,irq)

! PWD: compute the extrapolar diagonal occupied term
! this is going to get called more than necessary across
! all cores but it is comparatively cheap
              if (pol%extraPolarMethod.eq.1) then
                call mtxelGpGExtraPolar(ivin,gvec,vwfn,cwfn,pol,ispin,irq)
              endif

              if(peinf%inode.eq.0) call timacc(10,2,tsec)
            endif

          enddo ! ispin

          if ( iv .le. peinf%nvownt) then
            SAFE_DEALLOCATE_P(vwfn%zv)
          endif

        enddo ! iv
        if (peinf%nvownt>0) then
          SAFE_DEALLOCATE_P(cwfn%zc)
          SAFE_DEALLOCATE_P(vwfn%ev)
          SAFE_DEALLOCATE_P(cwfn%ec)
        endif
        if (pol%freq_dep .eq. 0) then
          SAFE_DEALLOCATE_P(pol%eden)
          if(pol%extraPolarMethod.eq.1) then
            SAFE_DEALLOCATE_P(pol%edenExtraPolar)
          endif
        endif
        
      enddo ! irq

!---------------------------------------------------------------------
! JRD: Create Convergence Tests

! Loop over matrix elements in chi

! JRD: Construct convergence test for G=G`=0 and G=G`=pol%nmtx
! This will print chi(0,0), chi(Gmax,Gmax) as a function of conduction
! bands later on.

! PWD:  this needs to be worked on for the extrapolar case
!       since we`ve already applied part of the correction to pol%gme
!       this is going to be quite messed up.

      if(peinf%inode.eq.0) call timacc(24,1,tsec)

#ifdef VERBOSE
      if (peinf%inode .eq. 0) write(6,'(/,1x,"Starting Convergence Tests")')
#endif

      if (pol%freq_dep .eq. 0) then
        
        convmin = 0D0
        convmin2 = 0D0
        convmax = 0D0
        convmax2 = 0D0
        
        do ispin =1, kp%nspin
          do irq = 1, nrq
            do it = 1, nst(irq)
              
              phtmpmin=pht(1,it,irq)*MYCONJG(pht(1,it,irq))
              phtmpmax=pht(pol%nmtx,it,irq)*MYCONJG(pht(pol%nmtx,it,irq))
              
              do iv=1,(vwfn%nband+pol%ncrit)
                iown =1 
                do j=1,cwfn%nband-vwfn%nband
                  isend=peinf%global_pairowner(iv,j)-1
                  if (isend .eq. peinf%inode) then
                    if (iown .gt. peinf%ncownt) write(6,*) 'iown bigger than ncownt'
                    gvalmin = pol%gme(indt(1,it,irq),iown,peinf%indexv(iv), &
                      ispin,irq)
                    gvalmax = pol%gme(indt(pol%nmtx,it,irq),iown,peinf%indexv(iv), &
                      ispin,irq)
                    convmin2(j) = convmin2(j) + gvalmin*MYCONJG(gvalmin)*phtmpmin*fact*(-1D0)
                    convmax2(j) = convmax2(j) + gvalmax*MYCONJG(gvalmax)*phtmpmax*fact*(-1D0)
                    iown=iown+1
                  endif
                enddo ! j
              enddo ! iv
              
            enddo ! it
          enddo ! irq
        enddo ! ispin

#ifdef MPI
        call MPI_reduce(convmin2,convmin,cwfn%nband-vwfn%nband,MPI_SCALAR,MPI_SUM,0,MPI_COMM_WORLD,mpierr)
        call MPI_reduce(convmax2,convmax,cwfn%nband-vwfn%nband,MPI_SCALAR,MPI_SUM,0,MPI_COMM_WORLD,mpierr)
#else
        convmin = convmin2
        convmax = convmax2
#endif

        if(peinf%inode.eq.0) call timacc(24,2,tsec)

      endif ! pol%freq_dep .eq. 0

!-----------------------------------------------------------------------
! Consruct part of chi that this proc owns

#ifdef VERBOSE
      if (peinf%inode .eq. 0) write(6,'(/,1x,"Doing chi Summation")')
#endif
      if (peinf%inode.eq.0) call timacc(15,1,tsec)

! GCOMM - ELEMENTS

      if (pol%gcomm .eq. 0) then

        if (pol%freq_dep .eq. 0) then
          SAFE_ALLOCATE(chilocal, (scal%npr,scal%npc))
        endif

        if (pol%freq_dep .eq. 2) then
          SAFE_ALLOCATE(chilocalRDyn, (scal%npr,scal%npc,pol%nfreq))
#ifdef CPLX
          SAFE_ALLOCATE(chilocalADyn, (scal%npr,scal%npc,pol%nfreq))
#endif
        endif

#ifdef VERBOSE
        call logit("Starting chi Sum")
#endif

        do irq=1,nrq

! CHP : It is painful to wait hours until we see the message
!       'Done Polarizability'. We need to know where we are!

          if(peinf%inode.eq.0) then
            write(6,'(A,i8,6x,A,i8,A)') '### irq=',irq,'(nrq=',nrq,')'
          endif
#ifdef MPI
          call MPI_barrier(MPI_COMM_WORLD,mpierr)
#endif

          do ispin=1,kp%nspin

            iowna(:)=1
            
            ntot=(vwfn%nband+pol%ncrit)*nst(irq)
            if (pol%freq_dep .eq. 0) then
              chilocal=0
              SAFE_ALLOCATE(gmetempr, (scal%npr,ntot))
              SAFE_ALLOCATE(gmetempc, (ntot,scal%npc))
            endif
            if (pol%freq_dep .eq. 2) then
              chilocalRDyn=0
              SAFE_ALLOCATE(gmeRDyntempr, (scal%npr,ntot,pol%nfreq))
              SAFE_ALLOCATE(gmeRDyntempc, (ntot,scal%npc))
#ifdef CPLX
              chilocalADyn=0
              SAFE_ALLOCATE(gmeADyntempr, (scal%npr,ntot,pol%nfreq))
              SAFE_ALLOCATE(gmeADyntempc, (ntot,scal%npc))
#endif
            endif

            do j=1,cwfn%nband-vwfn%nband

! We do two giant loops here for freq_dep cases

              if (pol%freq_dep .eq. 0) then
                
                itot=0
                if (peinf%inode.eq.0) call timacc(14,1,tsec)
                do iv=1,(vwfn%nband+pol%ncrit)
                  isend=peinf%global_pairowner(iv,j)-1
                  if (isend .lt. 0) write(0,*) 'Illegal value for mpi proc, isend:',peinf%inode,iv,j,peinf%global_pairowner(iv,j)
                  if (isend .eq. peinf%inode) then
                    if (iowna(iv) .gt. peinf%ncownt) write(6,*) 'iowna(iv) bigger than ncownt'
                    gmetempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv), &
                      ispin,irq) * sqrt(fact)
                    iowna(iv)=iowna(iv)+1
                  endif
#ifdef MPI
                  call MPI_Bcast(gmetempn,pol%nmtx,MPI_SCALAR,isend,MPI_COMM_WORLD,mpierr)
#endif
                  if (scal%npr*scal%npc .ne. 0) then

                    do it =1, nst(irq)
                      itot = itot + 1
                    
                      do icurr=1,scal%npr
                        gmetempr(icurr,itot)=gmetempn(indt(scal%imyrow(icurr),it,irq)) * &
                          pht(scal%imyrow(icurr),it,irq)
                      enddo

                      do icurr=1,scal%npc
                        gmetempc(itot,icurr)=MYCONJG(gmetempn(indt(scal%imycol(icurr),it,irq))*pht(scal%imycol(icurr),it,irq))
                      enddo
                    enddo ! it
                  endif
                enddo ! iv
                if (peinf%inode.eq.0) call timacc(14,2,tsec)

! JRD: Using Level3 BLAS here for better performance

                if (peinf%inode.eq.0) call timacc(30,1,tsec)

                if (scal%npr*scal%npc .ne. 0) then
                  call X(GEMM)('n','n',scal%npr,scal%npc,ntot, &
                   -ONE,gmetempr,scal%npr,gmetempc,ntot,ONE,chilocal,scal%npr)
                endif

! JRD: Slower Level2 BLAS

!                    do itot=1,ntot
!                      call ZGERU(scal%npr,scal%npc,-1D0,gmetempr(:,itot), &
!                        1,gmetempc(itot,:),1,chilocal,scal%npr)
!                    enddo

! Significantly Slower Manual Outer Product

!                    do itot=1,ntot
!                      curr=0
!                      do icolm = 1,scal%npc
!                        do irowm = 1,scal%npr
!                          chilocal(irowm,icolm) = chilocal(irowm,icolm) - &
!                           gmetempr(irowm,itot)*gmetempc(itot,icolm)
!                        enddo
!                      enddo
!                    enddo

                if (peinf%inode.eq.0) call timacc(30,2,tsec)
                
                if(pol%extraPolarMethod.eq.1) then
                  call die('ExtrapolarMethod currently broken for gcomm elements')
                endif

              endif ! pol%freq_dep .eq. 0

!---------------------
! JRD: Full Frequency Be Here

              if (pol%freq_dep .eq. 2) then

                itot=0
                if (peinf%inode.eq.0) call timacc(14,1,tsec)
                do iv=1,(vwfn%nband+pol%ncrit)
                  isend=peinf%global_pairowner(iv,j)-1
                  if (isend .lt. 0) write(0,*) 'Illegal value for mpi proc, isend: ',iv,j
                  if (isend .eq. peinf%inode) then
                    gmeRDyntempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv), &
                      ispin,irq) * sqrt(fact)
!                        edenDRtemp = pol%edenRDyn(:,iv,j,ispin,irq)*(-1D0)
                    if (abs(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irq)) .gt. Tol_Zero) then
                      do jj=1,pol%nfreq
                        edenDRtemp(jj)= -0.50d0 * ( &
                          1d0/(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irq)-(pol%dFreqGrid(jj)+pol%dFreqBrd(jj))/ryd)+ &
                          1d0/(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irq)+(pol%dFreqGrid(jj)+pol%dFreqBrd(jj))/ryd))
                      enddo
                    else
                      edenDRtemp(:)=0D0
                    endif
#ifdef CPLX
                    gmeADyntempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv),ispin,irq) * sqrt(fact)
!                        edenDAtemp = pol%edenADyn(:,iv,j,ispin,irq)*(-1D0)
                    if (abs(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irq)) .gt. Tol_Zero) then
                      do jj=1,pol%nfreq
                        edenDAtemp(jj)= -0.50d0 * ( &
                          1d0/(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irq)-(pol%dFreqGrid(jj)-pol%dFreqBrd(jj))/ryd)+ &
                          1d0/(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irq)+(pol%dFreqGrid(jj)-pol%dFreqBrd(jj))/ryd))
                      enddo
                    else
                      edenDAtemp(:)=0D0
                    endif
#endif
                    iowna(iv)=iowna(iv)+1
                  endif
                  
#ifdef MPI
                  call MPI_Bcast(gmeRDyntempn,pol%nmtx,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr)
                  call MPI_Bcast(edenDRtemp,pol%nFreq,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr)
#ifdef CPLX
                  call MPI_Bcast(gmeADyntempn,pol%nmtx,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr)
                  call MPI_Bcast(edenDAtemp,pol%nFreq,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr)
#endif
#endif

                  if (scal%npr*scal%npc .ne. 0) then
                    do it =1, nst(irq)
                      itot = itot + 1
                      do icurr=1,scal%npr
                        gmeRDyntempr(icurr,itot,:)=gmeRDyntempn( &
                          indt(scal%imyrow(icurr),it,irq))*pht(scal%imyrow(icurr),it,irq)*edenDRtemp
                      enddo
                      do icurr=1,scal%npc
                        gmeRDyntempc(itot,icurr) = &
                          CONJG(gmeRDyntempn(indt(scal%imycol(icurr),it,irq))*pht(scal%imycol(icurr),it,irq))
                      enddo
#ifdef CPLX
                      do icurr=1,scal%npr
                        gmeADyntempr(icurr,itot,:)=gmeADyntempn( &
                          indt(scal%imyrow(icurr),it,irq))*pht(scal%imyrow(icurr),it,irq)*edenDAtemp
                      enddo
                      do icurr=1,scal%npc
                        gmeADyntempc(itot,icurr) = &
                          CONJG(gmeADyntempn(indt(scal%imycol(icurr),it,irq))*pht(scal%imycol(icurr),it,irq))
                      enddo
#endif
                    enddo ! it
                  endif

                enddo ! iv

                if (peinf%inode.eq.0) call timacc(14,2,tsec)

! JRD: Using Level3 BLAS here for better performance

                if (peinf%inode.eq.0) call timacc(30,1,tsec)
                    
                if (scal%npr*scal%npc .ne. 0) then
                  do jj =1, pol%nfreq
                    call ZGEMM('n','n',scal%npr,scal%npc,ntot,(-1D0,0D0),gmeRDyntempr(:,:,jj),scal%npr, &
                      gmeRDyntempc(:,:),ntot,(1D0,0D0),chilocalRDyn(:,:,jj),scal%npr)
#ifdef CPLX
                    call ZGEMM('n','n',scal%npr,scal%npc,ntot,(-1D0,0D0),gmeADyntempr(:,:,jj),scal%npr, &
                      gmeADyntempc(:,:),ntot,(1D0,0D0),chilocalADyn(:,:,jj),scal%npr)
#endif
                  enddo
                endif

                if (peinf%inode.eq.0) call timacc(30,2,tsec)
                
              endif ! pol%freq_dep .eq. 2
                  
            enddo ! j (loop over conduction bands)

            if (pol%freq_dep .eq. 0) then
              if (scal%npr*scal%npc .ne. 0) then
                pol%chi(:,:,ispin) = pol%chi(:,:,ispin) + chilocal(:,:)
              endif
              SAFE_DEALLOCATE(gmetempr)
              SAFE_DEALLOCATE(gmetempc)
            endif
                
            if (pol%freq_dep.eq.2) then
              if (scal%npr*scal%npc .ne. 0) then
                do jj = 1, pol%nfreq
                  pol%chiRDyn(jj,:,:,ispin) = pol%chiRDyn(jj,:,:,ispin) + chilocalRDyn(:,:,jj)
#ifdef CPLX
                  pol%chiADyn(jj,:,:,ispin) = pol%chiADyn(jj,:,:,ispin) + chilocalADyn(:,:,jj)
#endif
                enddo
              endif
              SAFE_DEALLOCATE(gmeRDyntempr)
              SAFE_DEALLOCATE(gmeRDyntempc)
#ifdef CPLX
              SAFE_DEALLOCATE(gmeADyntempr)
              SAFE_DEALLOCATE(gmeADyntempc)
#endif
            endif
            
          enddo ! ispin (loop over spins)
        enddo ! irq (loop over k-points in set rq)
        
        if (pol%freq_dep .eq. 0) then
          SAFE_DEALLOCATE(chilocal)
        endif
        if (pol%freq_dep .eq. 2) then
          SAFE_DEALLOCATE(chilocalRDyn)
#ifdef CPLX
          SAFE_DEALLOCATE(chilocalADyn)
#endif
        endif


!------------------------------------------------------------------------------------------

! GCOMM MATRIX

      else ! pol%gcomm .eq. 0
        
        if (pol%freq_dep.eq.0) then
          SAFE_ALLOCATE(chilocal2, (scal%npr,scal%npc,kp%nspin))
          chilocal2=0
        endif
        
        if (pol%freq_dep.eq.2) then
          SAFE_ALLOCATE(chilocal2RDyn, (scal%npr,scal%npc,pol%nfreq,kp%nspin))
          chilocal2RDyn=0
#ifdef CPLX
          SAFE_ALLOCATE(chilocal2ADyn, (scal%npr,scal%npc,pol%nfreq,kp%nspin))
          chilocal2ADyn=0
#endif
        endif
 
        ntot=0
        ntot2=0
        
        ntot = peinf%nvownt*peinf%ncownt
        do irq = 1, nrq
          ntot2=ntot2 + nst(irq) 
        enddo
        
        ntot=ntot*ntot2

!-------------------------------------------------------------------
! Static Be Here

        if (pol%freq_dep .eq. 0) then
          
          do ipe = 1, peinf%npes
            SAFE_ALLOCATE(chilocal, (scal%nprd(ipe),scal%npcd(ipe)))
            chilocal=0D0
            SAFE_ALLOCATE(gmetempr, (scal%nprd(ipe),ntot))
            SAFE_ALLOCATE(gmetempc, (ntot,scal%npcd(ipe)))
            
            if(pol%extraPolarMethod.eq.1) then
              SAFE_ALLOCATE(chilocalExtra, (scal%nprd(ipe),scal%npcd(ipe)))
              chilocalExtra=0D0
              SAFE_ALLOCATE(gmetemprExtra, (scal%nprd(ipe),ntot))
              SAFE_ALLOCATE(gmetempcExtra, (ntot,scal%npcd(ipe)))
            endif
            
            do ispin = 1 , kp%nspin

!                   if(peinf%inode.eq.0) then
!                     do j=1,nrq
!                       write(6,*) 'j, nst(j)',j, nst(j)
!                     enddo
                    
!                   endif

              call mtxelMultiply(scal,ntot,nrq,nst,fact,vwfn, &
                gmetempn,gmetempr,gmetempc,chilocal,pol%gme,pol,indt,pht,ipe,ispin,tsec)
                  

! PWD: ok we need to make the matrix elements products for the 
! first extrapolar term 

              if (pol%extraPolarMethod.eq.1) then
                call mtxelMultiply(scal,ntot,nrq,nst,fact,vwfn, &
                  gmetempn,gmetempr,gmetempc,chilocalExtra,pol%gmeExtra,pol,indt,pht,ipe,ispin,tsec)

! PWD: this is the opposite of what you would expect due to the eden losing its sign
! via the clever application of its sqr to each element in mtxel, or is it the opposite of the
! opposite since we treat chilocal as if the sign was right  
                chilocal(:,:)= chilocal(:,:)-chilocalExtra(:,:)

                if(peinf%inode.eq.0) write (6,*) scal%nprd(ipe),scal%npr,scal%npcd(ipe),scal%npc
                call applyExtrapolarDeltaTerm(kp,nrq,nst,fact,vwfn,scal,gvec,indtExtra,phtExtra,&
                  pol,chilocal,deltaCount,ipe)

              endif ! pol%extraPolarMethod.eq.1

              if (peinf%inode.eq.0) call timacc(14,1,tsec)

#ifdef MPI
              call MPI_reduce(chilocal(1,1),chilocal2(1,1,ispin),scal%npcd(ipe)*scal%nprd(ipe),MPI_SCALAR, &
                MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr)
              
              if(pol%extraPolarMethod.eq.1) then
                call MPI_reduce(deltaCount(1,1),deltaCountReduce(1,1), &
                  pol%nmtx*pol%nmtx, MPI_INTEGER, MPI_SUM,ipe-1, MPI_COMM_WORLD,mpierr)
              endif
                  
#else
              chilocal2(:,:,ispin)=chilocal(:,:)
#endif
              if (peinf%inode.eq.0) call timacc(14,2,tsec)                  
            enddo ! ispin

            SAFE_DEALLOCATE(chilocal)
            SAFE_DEALLOCATE(gmetempr)
            SAFE_DEALLOCATE(gmetempc)
                
            if(pol%extraPolarMethod.eq.1) then
              SAFE_DEALLOCATE(chilocalExtra)
              SAFE_DEALLOCATE(gmetemprExtra)
              SAFE_DEALLOCATE(gmetempcExtra)
            endif
                
          enddo ! ipe

!               if(pol%extraPolarMethod.eq.1) then
!                 if(peinf%inode.eq.0) then
!                   write(6,*) 'pre delta term chilocal2(1,1,ispin)', chilocal2(1,1,ispin)
!                 endif

!                 call applyExtrapolarDeltaTerm(kp,nrq,nst,fact,vwfn,scal,gvec,indtExtra,phtExtra,&
!                  pol,chilocal2,deltaCount)

!                 if(peinf%inode.eq.0) then
!                   write(6,*) 'post delta term chilocal2(1,1,ispin)', chilocal2(1,1,ispin)
!                 endif
!               endif
              
          do ispin =1, kp%nspin
            pol%chi(:,:,ispin) = chilocal2(:,:,ispin)
          enddo
          SAFE_DEALLOCATE(chilocal2)

          if(pol%extraPolarMethod.eq.1.and.peinf%inode.eq.0) then
            write(6,*) 'Count of delta term application in upper left of chi0 matrix'
            write(countFile, *) 'countQ',iq
            call open_file(606,file=countFile,form='formatted',status='replace')
            do deltaCountI = 1,pol%nmtx
              write(606,*) deltaCountReduce(deltaCountI,1:pol%nmtx)
            enddo
            call close_file(606)
            deltaCount(:,:)=0
            deltaCountReduce(:,:)=0
          endif

        endif ! pol%freq_dep .eq. 0

!-------------------------------------
! Full Frequency Be Here

        if (pol%freq_dep .eq. 2) then
          do ipe = 1, peinf%npes

! CHP: It is painful to wait hours until we see the message
!      'Done Polarizability'. We need to know where we are!

            if(peinf%inode.eq.0) then
              write(6,'(A,i8,6x,A,i8,A)') '### ipe=',ipe,'(npes=',peinf%npes,')'
            endif
#ifdef MPI
            call MPI_barrier(MPI_COMM_WORLD,mpierr)
#endif

            SAFE_ALLOCATE(chilocalRDyn, (scal%nprd(ipe),scal%npcd(ipe),pol%nfreq))
            chilocalRDyn=0
#ifdef CPLX
            SAFE_ALLOCATE(chilocalADyn, (scal%nprd(ipe),scal%npcd(ipe),pol%nfreq))
            chilocalADyn=0
#endif

            SAFE_ALLOCATE(gmeRDyntempr, (scal%nprd(ipe),ntot,pol%nfreq))
            SAFE_ALLOCATE(gmeRDyntempr2, (scal%nprd(ipe)))
            SAFE_ALLOCATE(gmeRDyntempc, (ntot,scal%npcd(ipe)))
#ifdef CPLX
            SAFE_ALLOCATE(gmeADyntempr, (scal%nprd(ipe),ntot,pol%nfreq))
            SAFE_ALLOCATE(gmeADyntempr2, (scal%nprd(ipe)))
            SAFE_ALLOCATE(gmeADyntempc, (ntot,scal%npcd(ipe)))
#endif

            do ispin = 1 , kp%nspin
                  
              itot = 0
              
              if (peinf%inode.eq.0) call timacc(31,1,tsec)
              
              do irq = 1, nrq
                do iv = 1,vwfn%nband+pol%ncrit
                  if (peinf%doiownv(iv) .eq. 1) then
                    ilimit = peinf%ncownt
                  else
                    ilimit = 0
                  endif
                  do j = 1, ilimit
                    
                    gmeRDyntempn(:) = pol%gme(:,j,peinf%indexv(iv),ispin,irq) * sqrt(fact)
#ifdef CPLX
                    gmeADyntempn(:) = pol%gme(:,j,peinf%indexv(iv),ispin,irq) * sqrt(fact)
#endif
                    
                    zvalue = pol%edenDyn(peinf%indexv(iv),j,ispin,irq)
                    if (abs(zvalue) .gt. Tol_Zero) then

                      do jj=1,pol%nfreq
                        edenDRtemp(jj)= -0.5d0*(1d0/(zvalue-(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)+ &
                                                1d0/(zvalue+(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd))
                      enddo
#ifdef CPLX
                      do jj=1,pol%nfreq
                        edenDAtemp(jj)= -0.5d0*(1d0/(zvalue-(-pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)+ &
                                                1d0/(zvalue+(-pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd))
                      enddo
#endif
                    else
                      edenDRtemp(:)= 0D0
#ifdef CPLX
                      edenDAtemp(:)= 0D0
#endif
                    endif
                    
                    do it = 1, nst(irq) 
                      itot = itot + 1
                      
                      do icurr=1,scal%nprd(ipe)
                        gmeRDyntempr2(icurr)=gmeRDyntempn(indt(scal%imyrowd(icurr,ipe),it,irq))*pht( &
                          scal%imyrowd(icurr,ipe),it,irq)
                      enddo
                      do jj = 1, pol%nfreq
                        gmeRDyntempr(:,itot,jj)=gmeRDyntempr2(:)*edenDRtemp(jj)
                      enddo
                      do icurr=1,scal%npcd(ipe)
                        gmeRDyntempc(itot,icurr) = &
                        MYCONJG(gmeRDyntempn(indt(scal%imycold(icurr,ipe),it,irq))*pht(scal%imycold(icurr,ipe),it,irq))
                      enddo
#ifdef CPLX
                      do icurr=1,scal%nprd(ipe)
                        gmeADyntempr2(icurr)=gmeADyntempn( &
                          indt(scal%imyrowd(icurr,ipe),it,irq))*pht(scal%imyrowd(icurr,ipe),it,irq)
                      enddo
                      do jj = 1, pol%nfreq
                        gmeADyntempr(:,itot,jj)=gmeADyntempr2(:)*edenDAtemp(jj)
                      enddo
                      do icurr=1,scal%npcd(ipe)
                        gmeADyntempc(itot,icurr)= &
                          MYCONJG(gmeADyntempn(indt(scal%imycold(icurr,ipe),it,irq))*pht(scal%imycold(icurr,ipe),it,irq))
                      enddo
#endif
                    enddo ! it
                    
                  enddo ! j
                enddo ! iv
              enddo ! irq
              
              if (peinf%inode.eq.0) call timacc(31,2,tsec)
                  
              if (peinf%inode.eq.0) call timacc(30,1,tsec)
              
              do jj =1, pol%nfreq
                call ZGEMM('n','n',scal%nprd(ipe),scal%npcd(ipe),ntot, &
                  (-1D0,0D0),gmeRDyntempr(:,:,jj),scal%nprd(ipe),gmeRDyntempc(:,:),ntot, &
                  (0D0,0D0),chilocalRDyn(:,:,jj),scal%nprd(ipe))
#ifdef CPLX
                call ZGEMM('n','n',scal%nprd(ipe),scal%npcd(ipe),ntot, &
                  (-1D0,0D0),gmeADyntempr(:,:,jj),scal%nprd(ipe),gmeADyntempc(:,:),ntot, &
                  (0D0,0D0),chilocalADyn(:,:,jj),scal%nprd(ipe))
#endif
              enddo
              
              if (peinf%inode.eq.0) call timacc(30,2,tsec)
              
              if (peinf%inode.eq.0) call timacc(14,1,tsec)
              
#ifdef MPI
              call MPI_reduce(chilocalRDyn(1,1,1),chilocal2RDyn(1,1,1,ispin), &
                pol%nfreq*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr)
#ifdef CPLX
              call MPI_reduce(chilocalADyn(1,1,1),chilocal2ADyn(1,1,1,ispin), &
                pol%nfreq*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr)
#endif

#else
              chilocal2RDyn(:,:,:,ispin)=chilocalRDyn(:,:,:)
#ifdef CPLX
              chilocal2ADyn(:,:,:,ispin)=chilocalADyn(:,:,:)
#endif
              
#endif
              if (peinf%inode.eq.0) call timacc(14,2,tsec)
              
            enddo ! ispin
            
            SAFE_DEALLOCATE(chilocalRDyn)
            SAFE_DEALLOCATE(gmeRDyntempr)
            SAFE_DEALLOCATE(gmeRDyntempr2)
            SAFE_DEALLOCATE(gmeRDyntempc)
#ifdef CPLX
            SAFE_DEALLOCATE(chilocalADyn)
            SAFE_DEALLOCATE(gmeADyntempr)
            SAFE_DEALLOCATE(gmeADyntempr2)
            SAFE_DEALLOCATE(gmeADyntempc)
#endif
          enddo ! ipe

          do ispin =1, kp%nspin
            do jj=1,pol%nfreq
              pol%chiRDyn(jj,:,:,ispin) = chilocal2RDyn(:,:,jj,ispin)
#ifdef CPLX
              pol%chiADyn(jj,:,:,ispin) = chilocal2ADyn(:,:,jj,ispin)
#endif
            enddo ! jj
          enddo ! ispin
          SAFE_DEALLOCATE(chilocal2RDyn)
#ifdef CPLX
          SAFE_DEALLOCATE(chilocal2ADyn)
#endif
        endif ! pol%freq_dep .eq. 2
        
      endif ! pol%gcomm .eq. 0
          
      if(peinf%inode.eq.0) call timacc(15,2,tsec)
      
      if (peinf%inode .eq. 0) write(6,'(/,1x,"Done Polarizability")')


! Done ChiSum
!-----------------------------------------------------------------------


!--------- Print Out some Matrix Elements -----------------------------

! JRD: Print out convergence

      if (pol%freq_dep .eq. 0 .and. peinf%inode .eq. 0) then
        SAFE_ALLOCATE(convmintotal, (cwfn%nband - vwfn%nband))
        SAFE_ALLOCATE(convmaxtotal, (cwfn%nband - vwfn%nband))
        convmintotal=0d0
        convmaxtotal=0d0
        
        convmintotal(1)=convmin(1)
        convmaxtotal(1)=convmax(1)
        do j = 2, cwfn%nband - vwfn%nband
          convmintotal(j)=convmintotal(j-1)+convmin(j)
          convmaxtotal(j)=convmaxtotal(j-1)+convmax(j)
        enddo
        
        idis = (cwfn%nband - vwfn%nband)/10
        nct = cwfn%nband - vwfn%nband
        if (idis.gt.0.and.idis.lt.nct) then
          convminval = (nct*convmintotal(nct) - (nct-idis)*convmintotal(nct-idis)) / idis
          convmaxval = (nct*convmaxtotal(nct) - (nct-idis)*convmaxtotal(nct-idis)) / idis
        else
          convminval = 0.0d0
          convmaxval = 0.0d0
        endif
        
        write(strhead,701)1
        write(strtail,701)pol%nmtx
        
        if (pol%fullConvLog .eq. 0) then
          write(17,'(a,3e16.8)') '# q=', pol%qpt(:,iq)
          write(17,'(a1,a7,4a20)') '#', 'ncbands', 'Re chi(0,0)', 'extrap', 'Re chi(Gmax,Gmax)', 'extrap'
          
          do j = 1, cwfn%nband - vwfn%nband
            write(17,'(i8,4e20.8)') j, dble(convmintotal(j)), &
              dble(convminval), dble(convmaxtotal(j)), dble(convmaxval)
          enddo
          write(17,*)
        elseif (pol%fullConvLog .eq. 1) then
          write(17,801) pol%qpt(:,iq), iq
          write(17,802)
          
          write(17,803)TRUNC(strhead),TRUNC(strhead)
          do j = 1, cwfn%nband - vwfn%nband
            write(17,805) j, dble(convmintotal(j))
          enddo
          
          write(17,804)TRUNC(strtail),TRUNC(strtail)
          do j = 1, cwfn%nband - vwfn%nband
            write(17,805) j, dble(convmaxtotal(j))
          enddo
        elseif (pol%fullConvLog .eq. 2) then
          write(17,801) pol%qpt(:,iq), iq
          write(17,802)
          
          write(17,803)TRUNC(strhead),TRUNC(strhead)
          do j = 1, cwfn%nband - vwfn%nband
            write(17,805) j, convmintotal(j)
          enddo
          
          write(17,804)TRUNC(strtail),TRUNC(strtail)
          do j = 1, cwfn%nband - vwfn%nband
            write(17,805) j, convmaxtotal(j)
          enddo
        endif

701     format(i16)
801     format('#',1x,'q =',3f10.6,1x,'iq =',i4)
802     format(2x,'nbands',1x,'chi0')
803     format(2x,'head ig =',1x,a,1x,'igp =',1x,a)
804     format(2x,'tail ig =',1x,a,1x,'igp =',1x,a)
805     format(i8,2e16.8)

        SAFE_DEALLOCATE(convmintotal)
        SAFE_DEALLOCATE(convmaxtotal)
      endif ! pol%freq_dep .eq. 0 .and. peinf%inode .eq. 0

!          write(6,*) 'End Convergence Writing'

      SAFE_DEALLOCATE(pht)
      SAFE_DEALLOCATE(indt)
      SAFE_DEALLOCATE(ind)
      SAFE_DEALLOCATE(ph)
      SAFE_DEALLOCATE(nst)
      SAFE_DEALLOCATE_P(pol%gme)
      if(pol%extraPolarMethod.eq.1) then
        SAFE_DEALLOCATE(deltaCount)
        SAFE_DEALLOCATE(deltaCountReduce)
        SAFE_DEALLOCATE_P(pol%gmeExtra)            
        SAFE_DEALLOCATE_P(pol%gpgExtraPolar)
        SAFE_DEALLOCATE(phExtra)
        SAFE_DEALLOCATE(phtExtra)
        SAFE_DEALLOCATE(indExtra)
        SAFE_DEALLOCATE(indtExtra)
        SAFE_DEALLOCATE(gmetempnExtra)
      endif
      
      if (pol%freq_dep .eq. 0) then
        SAFE_DEALLOCATE(gmetempn)
      endif
      if (pol%freq_dep .eq. 2) then
        SAFE_DEALLOCATE(gmeRDyntempn)
        SAFE_DEALLOCATE(tempvalRDyn)
        SAFE_DEALLOCATE_P(pol%edenDyn)
        SAFE_DEALLOCATE(edenDRtemp)
        SAFE_DEALLOCATE(chiRDyntmp)
#ifdef CPLX
        SAFE_DEALLOCATE(edenDAtemp)
        SAFE_DEALLOCATE(gmeADyntempn)
        SAFE_DEALLOCATE(tempvalADyn)
        SAFE_DEALLOCATE(chiADyntmp)
#endif
      endif

      SAFE_DEALLOCATE(iowna)

      if (pol%freq_dep .eq. 0) then
        SAFE_DEALLOCATE(convmin)
        SAFE_DEALLOCATE(convmax)
        SAFE_DEALLOCATE(convmin2)
        SAFE_DEALLOCATE(convmax2)
      endif

    else ! pol%skip_chi

      if(iflagq02 .eq. 1)  then
        
        itape=10
        if(peinf%inode.eq.0) then
          write(6,'(a)') 'Reading from file chi0mat.'
          call open_file(unit=10,file='chi0mat',form='unformatted',status='old')
          read(10)
          read(10)
          read(10)
          read(10)
          read(10)
          read(10)
          read(10)
          read(10)
          read(10)
          read(10)
          read(10) 
          read(10) nmtx_t
!                write(6,*) 'nmtx_t for chi0mat', nmtx_t
        endif
#ifdef MPI
        call mpi_barrier(MPI_COMM_WORLD,mpierr)
#endif
        do ispin=1,kp%nspin
          if (pol%freq_dep .eq. 0) then
            call read_matrix_d(scal,pol%chi(:,:,ispin),pol%nmtx,itape)
          endif ! pol%freq_dep .eq. 0
          if (pol%freq_dep .eq. 2) then
            call read_matrix_f(scal,pol%nFreq,pol%chiRDyn(:,:,:,ispin), &
              pol%chiADyn(:,:,:,ispin),pol%nmtx,itape)
          endif ! pol%freq_dep .eq. 2
        enddo ! ispin

      else ! iflagq02 .eq. 1

        itape=11
        if(peinf%inode.eq.0) then
          write(6,'(a)') 'Reading from file chimat.'
          if (initial_access .eq. 0) then
            call open_file(unit=11,file='chimat',form='unformatted',status='old')
            read(11)
            read(11)
            read(11)
            read(11)
            read(11)
            read(11)
            read(11)
            read(11)
            read(11)
            read(11)
          endif
          read(11)
          read(11) nmtx_t
!                write(6,*) 'nmtx_t for chimat', nmtx_t
        endif
#ifdef MPI
        call mpi_barrier(MPI_COMM_WORLD,mpierr)
#endif
        do ispin=1,kp%nspin
          if (pol%freq_dep .eq. 0) then
            call read_matrix_d(scal,pol%chi(:,:,ispin),pol%nmtx,itape)
          endif ! pol%freq_dep .eq. 0
          if (pol%freq_dep .eq. 2) then
            call read_matrix_f(scal,pol%nFreq,pol%chiRDyn(:,:,:,ispin),pol%chiADyn(:,:,:,ispin),pol%nmtx,itape)
          endif ! pol%freq_dep .eq. 2

        enddo ! ispin
        initial_access = 1
      endif ! iflagq02 .eq. 1

    endif ! pol%skip_chi

!-----------------------------------------------------------------------
! JRD: Now write out elements that Proc 1 owns

    do ispin = 1, kp%nspin

      if (pol%freq_dep.eq.0 .and. peinf%inode.eq.0) then
        write(7,940) ispin,kp%nspin
        do i=1,scal%npr
          ix=scal%isrtxrow(i)
          do j=1,scal%npc

! JRD: Diagonal, subdiagonal and wings only

            jx=scal%isrtxcol(j)
            if(i.eq.j .or. (gvec%k(1,ix) .eq. 0 .and. gvec%k(2,ix) .eq. 0 .and. gvec%k(3,ix) .eq. 0)) &
              write(7,950) (gvec%k(k,ix),k=1,3),gvec%ekin(ix),(gvec%k(k,jx),k=1,3),gvec%ekin(jx),pol%chi(i,j,ispin)
          enddo
        enddo
      endif ! pol%freq_dep.eq.0 .and. peinf%inode.eq.0
      
      if (pol%freq_dep.eq.2 .and. peinf%inode.eq.0) then
        write(7,940) ispin,kp%nspin
        do i=1,scal%npr
          ix=scal%isrtxrow(i)
          do j=1,scal%npc
            
! JRD: Diagonal and subdiagonal only

            jx=scal%isrtxcol(j)
            if(i.eq.j) &
              write(7,950) (gvec%k(k,ix),k=1,3),gvec%ekin(ix),(gvec%k(k,jx),k=1,3),gvec%ekin(jx),pol%chiRDyn(1,i,j,ispin)
          enddo
        enddo
!            write(7,940) ispin,kp%nspin

        ijkfreq = pol%nfreq / 2
        if (ijkfreq .ge. 2) then

          write(7,*)
          write(7,*) "frq = ", ijkfreq
          write(7,*)

          do i=1,scal%npr
            ix=scal%isrtxrow(i)
            do j=1,scal%npc

! JRD: Diagonal and subdiagonal only

              jx=scal%isrtxcol(j)
              if(i.eq.j) &
                write(7,950) (gvec%k(k,ix),k=1,3),gvec%ekin(ix),(gvec%k(k,jx),k=1,3),gvec%ekin(jx),pol%chiRDyn(ijkfreq,i,j,ispin)
            enddo
          enddo
        endif
      endif ! pol%freq_dep.eq.2 .and. peinf%inode.eq.0
      
940   format(/,10x,' independent matrix elements of chi', 7x,'spin index= ',1i1,1x,1i1,/,/,&
        10x,'g',10x,'g**2',10x,'gp',10x,'gp**2',10x,'chi(g,gp)')
! if last value is real, only one of the f13.8 will be used.
950   format(3i5,f10.5,5x,3i5,f10.5,5x,2f15.10)

    enddo ! ispin (loop over spins)

!        write(6,*) 'End Element Writing'


!--------- write polarizability matrix and crystal info on tape ---------

    if (pol%skip_epsilon) then
      
      itape=11
      if(iflagq02.gt.0) itape=10

      if(peinf%inode.eq.0) then
        write(itape) syms%ntranq,(((syms%mtrx(i,j,syms%indsub(n)),i=1,3),j=1,3), &
          (syms%tnp(k,syms%indsub(n)),syms%kgzero(k,n),k=1,3),n=1,syms%ntranq)
        np=pol%nmtx*(pol%nmtx+1)/2
        write(itape) pol%nmtx,np,(pol%isrtx(i),gvec%ekin(i),i=1,gvec%ng),(pol%irow(i),i=1,pol%nmtx)
      endif

      do ispin=1,kp%nspin
        if (pol%freq_dep .eq. 0) then
          call write_matrix_d(scal,pol%chi(:,:,ispin),pol%nmtx,itape)
        endif ! pol%freq_dep .eq. 0
        if (pol%freq_dep .eq. 2) then
          call write_matrix_f(scal,pol%nFreq,pol%chiRDyn(:,:,:,ispin), &
#ifdef CPLX
            pol%chiADyn(:,:,:,ispin),&
#endif
            pol%nmtx, itape)
        endif ! pol%freq_dep .eq. 2
      enddo ! ispin

    endif ! pol%skip_epsilon

!        write(6,*) 'End Matrix Writing'

! Use pol%chi(j,1) as sum over spin components
! JRD: Why was proc 0 the only one doing this??!!

    if (pol%freq_dep .eq. 0) then
      if(kp%nspin.eq.2) pol%chi(:,:,1)=pol%chi(:,:,1)+pol%chi(:,:,2)
    endif ! pol%freq_dep .eq. 0
    if (pol%freq_dep .eq. 2) then
      if(kp%nspin.eq.2) pol%chiRDyn(:,:,:,1)=pol%chiRDyn(:,:,:,1)+pol%chiRDyn(:,:,:,2)
#ifdef CPLX
      if(kp%nspin.eq.2) pol%chiADyn(:,:,:,1)=pol%chiADyn(:,:,:,1)+pol%chiADyn(:,:,:,2)
#endif
    endif ! pol%freq_dep .eq. 2

    if (peinf%inode .eq. 0) then
      call timacc(13,1,tsec)
#ifdef VERBOSE
      call logit('Calling epsinv')
#endif
    endif ! peinf%inode .eq. 0

    if (.not. pol%skip_epsilon) then

      call epsinv(gvec,pol,q0,q0norm,iflagq02,crys,scal,kp,omega_plasma)

      if(peinf%inode.eq.0) then
#ifdef VERBOSE
        call logit('Finished epsinv')
#endif
        call timacc(13,2,tsec)
      endif ! peinf%inode.eq.0
      
    endif ! pol%skip_epsilon

    if (pol%freq_dep .eq. 0) then
      SAFE_DEALLOCATE_P(pol%chi)
    endif
    if (pol%freq_dep .eq. 2) then
      SAFE_DEALLOCATE_P(pol%chiRDyn)
#ifdef CPLX
      SAFE_DEALLOCATE_P(pol%chiADyn)
#endif
    endif

    SAFE_DEALLOCATE(indrq)
    SAFE_DEALLOCATE(neq)

    SAFE_DEALLOCATE_P(pol%isrtx)
    SAFE_DEALLOCATE_P(pol%isrtxi)
    if(pol%extraPolarMethod.eq.1) then
      SAFE_DEALLOCATE_P(pol%isrtxExtra)
      SAFE_DEALLOCATE_P(pol%isrtxiExtra)
    endif
    SAFE_DEALLOCATE_P(pol%irow)
    SAFE_DEALLOCATE(irowv)
    SAFE_DEALLOCATE(icolv)
    SAFE_DEALLOCATE(irowmv)
    SAFE_DEALLOCATE(icolmv)
    SAFE_DEALLOCATE(irowmvg)
    SAFE_DEALLOCATE(icolmvg)
    SAFE_DEALLOCATE_P(scal%isrtxcol)
    SAFE_DEALLOCATE_P(scal%isrtxrow)
    SAFE_DEALLOCATE_P(scal%imycol)
    SAFE_DEALLOCATE_P(scal%imyrow)
    SAFE_DEALLOCATE_P(scal%imycold)
    SAFE_DEALLOCATE_P(scal%imyrowd)

  enddo ! iq (loop over q points)

  SAFE_DEALLOCATE_P(scal%nprd)
  SAFE_DEALLOCATE_P(scal%npcd)

! End q point loop!
!-------------------------------------------------------------------


!----------- Clean House -------------------------------------------

#ifdef VERBOSE
  call logit('Cleaning up')
#endif

  if (.not. pol%skip_epsilon) then
    call destroy_qran()
  endif

  if(.not. pol%skip_chi) then
    if(peinf%inode == 0) call close_file(17) ! file chi_converge.dat
    call destroy_fftw_plans()
  endif
  if (pol%iwritecoul .eq. 1) then
    if (peinf%inode .eq. 0) then
      call close_file(19) ! file vcoul
    endif
  endif

  SAFE_DEALLOCATE_P(gvec%ekin)
  SAFE_DEALLOCATE_P(kp%w)
  SAFE_DEALLOCATE_P(kp%rk)
  SAFE_DEALLOCATE_P(kp%el)

  if(pol%extraPolarMethod.eq.1) then
    SAFE_DEALLOCATE(ekinxExtra) 
  endif

  if(indexq0 .gt. 0 .and. valueq0 .eq. 1 .and. pol%iqexactlyzero .eq. 0) then
    SAFE_DEALLOCATE_P(kpq%w)
    SAFE_DEALLOCATE_P(kpq%rk)
    SAFE_DEALLOCATE_P(kpq%el)
  endif
  SAFE_DEALLOCATE_P(gvec%k)
  SAFE_DEALLOCATE_P(gvec%indv)
  SAFE_DEALLOCATE_P(pol%qpt)
  SAFE_DEALLOCATE_P(vwfn%isort)
  SAFE_DEALLOCATE_P(cwfn%isort)
  SAFE_DEALLOCATE_P(cwfn%ekin)
  SAFE_DEALLOCATE_P(cwfn%band_index)
  SAFE_DEALLOCATE_P(peinf%global_nvown)
  SAFE_DEALLOCATE_P(peinf%global_ncown)
  SAFE_DEALLOCATE_P(peinf%indexc)
  SAFE_DEALLOCATE_P(peinf%indexv)
  SAFE_DEALLOCATE_P(peinf%invindexv)
  SAFE_DEALLOCATE_P(peinf%invindexc)
  SAFE_DEALLOCATE_P(peinf%doiownv)
  SAFE_DEALLOCATE_P(peinf%doiownc)
  SAFE_DEALLOCATE_P(peinf%global_pairowner)
!      SAFE_DEALLOCATE_P(peinf%ciown)
  if(peinf%inode.eq.0) then
    call close_file(7) ! epsilon.log

    nqnonzero = pol%nq
    if (indexq0.gt.0)  nqnonzero = pol%nq - 1
    
    if (pol%skip_epsilon) then      
      if (indexq0.gt.0) call close_file(10) ! chi0mat
      if (nqnonzero > 1) call close_file(11) ! chimat 
    else 
      if (indexq0.gt.0) call close_file(12) ! eps0mat 
      if (nqnonzero > 1) call close_file(13) ! epsmat 
    endif ! pol%skip_epsilon 
  endif

  if (pol%iwriteint .ne. 0) then
    SAFE_DEALLOCATE_P(intwfnv%ng)
    SAFE_DEALLOCATE_P(intwfnv%isort)
    SAFE_DEALLOCATE_P(intwfnv%cg)
    SAFE_DEALLOCATE_P(intwfnv%qk)
    SAFE_DEALLOCATE_P(intwfnv%el)
    if(indexq0 .gt. 0 .and. valueq0 .eq. 1 .and. pol%iqexactlyzero .eq. 0) then
      SAFE_DEALLOCATE_P(intwfnvq%ng)
      SAFE_DEALLOCATE_P(intwfnvq%isort)
      SAFE_DEALLOCATE_P(intwfnvq%cg)
      SAFE_DEALLOCATE_P(intwfnvq%qk)
      SAFE_DEALLOCATE_P(intwfnvq%el)
    endif
    SAFE_DEALLOCATE_P(intwfnc%ng)
    SAFE_DEALLOCATE_P(intwfnc%isort)
    SAFE_DEALLOCATE_P(intwfnc%cg)
    SAFE_DEALLOCATE_P(intwfnc%el)
    SAFE_DEALLOCATE_P(intwfnc%cbi)
    SAFE_DEALLOCATE_P(intwfnc%qk)
  endif


!------------- Print Timing Info -----------------------------------------


#ifdef MPI
  call MPI_barrier(MPI_COMM_WORLD,mpierr)
#endif

#ifdef VERBOSE
  call logit('Calculating Timing Info')
#endif

  routnam(1)='TOTAL:'
  routnam(2)='INPUT:'
  routnam(3)='INPUT_Q:'
  routnam(4)='FULLBZ:'
  routnam(5)='GVEC:'
  routnam(6)='SUBGRP:'
  routnam(7)='INDEP:'
  routnam(8)='IRRBZ:'
  routnam(9)='GENWF:'
  routnam(10)='MTXEL:'
  routnam(11)='RQSTAR:'
  routnam(12)='GMAP:'
  routnam(13)='EPSINV (TOTAL):'
  routnam(14)='CHI SUM (COMM):'
  routnam(15)='CHI SUM (TOTAL):'
  routnam(16)='GENWF (VAL):'
  routnam(17)='GENWF (COND):'
  routnam(18)='EPSINV (VCOUL):'
  routnam(19)='JOB SETUP:'
  routnam(20)='Q LOOP SETUP:'
  routnam(21)='INIT CUTOFF:'
  routnam(22)='INIT SCALAPACK:'
  routnam(23)='INIT ARRAYS:'
  routnam(24)='CONVERGE TESTS:'
  routnam(25)='MTXEL (DENOM):'
  routnam(26)='MTXEL (FFT):'
  routnam(28)='GENWF (C-Ekin):'
  routnam(29)='GENWF (C-Sort):'
  routnam(30)='CHI SUM (' // TOSTRING(X(GEMM)) // '):'
  routnam(31)='CHI SUM (PREP):'
  routnam(32)='MTXEL EXP(DENOM):'
  routnam(33)='MTXEL EXP (FFT):'
  
  routsrt=(/ 1, 4, 8, 6,11,12, 5, 7, 2, 3,9,16,17,28,29,19,20,21,22,23, &
    10,25,26,32,33,15,14,30,31,13,18,24 /)

  if(peinf%inode.eq.0) then
    call timacc(1,2,tsec)
    write(6,*)
    write(6,9000) 'CPU [s]','WALL [s]','#'
    write(6,*)
    do i=2,ubound(routsrt, 1)
      call timacc(routsrt(i),3,tsec,ncount)
      write(6,9001) routnam(routsrt(i)),tsec(1),tsec(2),ncount
    enddo
    write(6,*)
    call timacc(routsrt(1),3,tsec,ncount)
    write(6,9002) routnam(routsrt(1)),tsec(1),tsec(2)
    write(6,*)
    write(6,*)
9000 format(24x,a13,  3x,a13,  3x,a9)
9001 format(a24,f13.3,3x,f13.3,3x,i9)
9002 format(a24,f13.3,3x,f13.3)
  endif

  call write_memory_usage()
  
  if (pol%iwriteint .eq. 0) then
    itpc = 100028+peinf%inode
    write(filename,'(a,i4.4)') 'INT_CWFN_', peinf%inode
    call open_file(itpc, file = filename, status='old')
    call close_file(itpc, delete = .true.) ! files INT_CWFN_*

    if(peinf%inode == 0) then
      itpv = 200028
      call open_file(itpv, file = 'INT_VWFN', status='old')
      call close_file(itpv, delete = .true.) ! files INT_VWFN

      ! this is the condition for calling input_q above
      if(indexq0 .gt. 0 .and. valueq0 .eq. 1 .and. pol%iqexactlyzero .eq. 0) then
        itpv = 300028
        call open_file(itpv, file = 'INT_VWFQ', status='old')
        call close_file(itpv, delete = .true.) ! files INT_VWFQ
      endif
    endif
  endif

#ifdef MPI
  call MPI_Finalize(mpierr)
#endif

end program epsilon
