!=======================================================================
!
! Routines:
!
! 1. haydock      Originally By MLT       Last Modified 6/24/08 (JRD)
!
!     Calculates the real and imaginary parts of the macroscopic dielectric
!     function starting from interaction matrix elements calculated by
!     the Kernel code. It uses interpolation in the matrix elements and
!     the Haydock recursion method. Spin-polarized case implemented.
!
!     For more details, see:
!     Rohlfing & Louie, PRB 62:(8), 4927 (2000)
!     Benedict & Shirley, PRB 59:(8), 5441 (1999)
!     R. Haydock, Comput. Phys. Commun. 20, 11 (1980)
!     G. Strinati, Rivista del Nuovo Cimento 11:(12), 1 (1988)
!
!     Please report bugs to: jdeslip@civet.berkeley.edu
!
!========================================================================

#include "f_defs.h"

subroutine haydock(eqp,xct,flag,nmax)

  use global_m
  use genwf_m
  use intkernel_m
  use intwfn_m
  use random_m
  implicit none

  type (eqpinfo), intent(inout) :: eqp
  type (xctinfo), intent(inout) :: xct
  type (flags), intent(in) :: flag
  integer, intent(in) :: nmax

  type (crystal) :: crys
  type (mmtsinfo) :: mmts
  type (symmetry) :: syms
  type (gspace) :: gvec
  type (epsinfo) :: epsi
  type (grid) :: kg_fi, kgq_fi
  type (kpoints) :: kp_fi, kpq_fi
  type (wavefunction) :: wfnc_fi
  type (wavefunction) :: wfnvq_fi
  type (work_genwf) :: work, workq
  type (int_wavefunction) :: intwfnc
  type (int_wavefunction) :: intwfnv

  character :: tmpstr*128,filename*20
  integer :: ii,jj,ncount,ntim,nmat,nblock
  integer :: ikb, icb, ivb
  integer :: ik,ikq,ikt,ikrq,ikcvs,ikcvst,ic,iv,is
  integer :: seed, itpc,itpv
  integer :: ih,ith,n0
  real(DP) :: vol,vol1,sum,sum1
  real(DP) :: tsec(2),tmin(2),tmax(2),omega_plasma
  
  character*16, allocatable :: routnam(:)
  integer, allocatable :: imap(:),indexq_fi(:)
  real(DP), allocatable :: kco(:,:)
  SCALAR, allocatable :: &
    dcc(:,:,:,:),dvv(:,:,:,:),s1(:),s1_l(:),s1k(:,:,:),dummy(:), &
    hmtrx(:,:),s0(:),s0_l(:),hqpcc(:,:,:),hqpvv(:,:,:)

!----------------------
! Initialize random numbers

  peinf%jobtypeeval = 1

!----------------------
! Initialize wcoul0        

  xct%wcoul0 = 0d0

!----------------------
! Initialize timer

  call timacc(1,0,tsec)
  call timacc(1,1,tsec)

!----------------------
! Initialize files

  mmts%nmax = nmax

  if (xct%iwritecoul .eq. 1) then
    if (peinf%inode .eq. 0) then
      call open_file(19,file='vcoul',form='formatted',status='replace')
    endif
  endif

  if (xct%skpt.ne.0.and.peinf%inode.eq.0) then
    write(0,*) 'WARNING: you are using partial sampling with the Haydock iterative scheme!'
    write(0,*) 'The optical spectrum may not be meaningful...'
  endif
  if(mmts%nmax.eq.0.and.flag%spec.ne.1) then
    call die('missing number of iterations')
  endif

! If flag%spec.eq.1, just calculate the spectrum from input coefficients

  ith=21
  if (flag%spec.eq.1) then
    if (peinf%inode.eq.0) then
      call open_file(ith,file='eps2_moments',form='unformatted',status='old')
      read(ith) mmts%nmax,mmts%norm,mmts%vol,ii,xct%nspin
      SAFE_ALLOCATE(mmts%an, (mmts%nmax))
      SAFE_ALLOCATE(mmts%bn, (mmts%nmax))
      read(ith) (mmts%an(ii),ii=1,mmts%nmax)
      read(ith) (mmts%bn(ii),ii=1,mmts%nmax)
      write(6,*)
      write(6,*) 'Reading coefficients from file; nmax = ',mmts%nmax, ' norm= ',mmts%norm
      write(6,*) '     n    a(n)        b(n)  '
      do ii=1,mmts%nmax
        write(6,120) ii, mmts%an(ii),mmts%bn(ii)
      enddo
      call close_file(ith)
    endif
120 format(3x,i4,3x,2e12.5)

  else

!-----------------------
! Read wavefunctions on the fine grid

  call logit('Calling input')
  call timacc(2,1,tsec)
  call input(crys,gvec,kg_fi,kp_fi,syms,eqp,xct,flag, &
    omega_plasma,.false.,intwfnc,.false.)
  
! If there is no specified number of eigenvalues, calculate
! all eigenvalues

  nmat = xct%nkpt*xct%ncband*xct%nvband*xct%nspin
  vol = xct%nktotal*crys%celvol
  if (peinf%inode.eq.0) then
    write(6,'(a,f32.14,a)') 'Crystal volume = ',vol,' a.u.'
    write(6,*) ' number of valence bands = ',xct%nvband
    write(6,*) ' number of cond. bands   = ',xct%ncband
    write(6,*) ' number of spins   = ',xct%nspin
    write(6,'(a,f7.4,a)') ' Broadening: ',xct%eta,' eV'
  endif
  call timacc(2,2,tsec)

  SAFE_ALLOCATE(indexq_fi, (xct%nkpt))
  if (flag%vm.eq.0.or.flag%dtm.ne.1) then
    call timacc(3,1,tsec)
    call logit('Calling input_q')
    call input_q(kp_fi,crys,gvec,kg_fi,kgq_fi,kpq_fi, &
      syms,xct,indexq_fi,eqp,flag,intwfnv,.false.)
    call timacc(3,2,tsec)
  endif

!----------------------------
! Calculate the transformation matrices from coarse-grid wavefunctions

  SAFE_ALLOCATE(dcc, (xct%nkpt,xct%ncband,xct%ncb_co,xct%nspin))
  SAFE_ALLOCATE(dvv, (xct%nkpt,xct%nvband,xct%nvb_co,xct%nspin))
  SAFE_ALLOCATE(kco, (3,xct%nkpt_co))
  SAFE_ALLOCATE(imap, (xct%nkpt))

  call logit('Calling intwfn')
  call timacc(4,1,tsec)
  call intwfn(kp_fi,crys,syms,xct,flag,gvec,kg_fi,kgq_fi, &
    dcc,dvv,kco,imap,indexq_fi,eqp,intwfnv,intwfnc)
  call timacc(4,2,tsec)

  SAFE_DEALLOCATE_P(xct%ifmax)
  if (flag%vm.eq.0.or.flag%dtm.ne.1) then
    ! otherwise, we did not call input_q to allocate it
    SAFE_DEALLOCATE_P(xct%ifmaxq)
  endif

!----------------------------
! Initialize recursion states and coefficients.
! If there is a previous run, read previous states from tape ith and
! skip the calculation of velocity/momentum matrix elements.
!
!       nblock : size of cvs block in hmtrx

  nblock = xct%ncband*xct%nvband*xct%nspin
  if (xct%ipar .eq. 1) then
    peinf%nblockd=nblock
  else if (xct%ipar .eq. 2) then
    peinf%nblockd = xct%nvband*xct%nspin
  else    
    peinf%nblockd = xct%nspin
  endif
  nmat= xct%nkpt*nblock
  SAFE_ALLOCATE(s1, (nmat))
  SAFE_ALLOCATE(s1k, (xct%ncband,xct%nvband,xct%nspin))
  s1= 0.d0
  s1k= 0.d0

! Since this code always compute an odd number of an's and bn's,
! it will calculate nmax+1 coefficients if nmax is even
! solution: overshoot the array size

  mmts%nmaxp=mmts%nmax+1
  SAFE_ALLOCATE(mmts%an, (mmts%nmaxp))
  SAFE_ALLOCATE(mmts%bn, (mmts%nmaxp))
  mmts%an(:) = 0.d0
  mmts%bn(:) = 0.d0
  mmts%vol = vol
  SAFE_ALLOCATE(s0, (nmat))
  s0 = 0.d0

  if (flag%vm.eq.2) then
    if (peinf%inode.eq.0) then
      call open_file(ith,file='eps2_moments',form='unformatted',status='old')
      read(ith) n0,mmts%norm,vol1,ii,is
      if ((abs(vol1-mmts%vol).gt.1.d-10).or.ii.ne.nmat.or.is.ne.xct%nspin) then
        call die(' Parameter mismatch in old file eps2_moments')
      endif
      
      read(ith) (mmts%an(ii),ii=1,n0)
      read(ith) (mmts%bn(ii),ii=1,n0)
      
      read(ith) (s1(ii),ii=1,nmat)
      read(ith) (s0(ii),ii=1,nmat)
      call close_file(ith)
      write(6,*) 'Reading old data from file eps2_moments'
      sum = (4.d0*PI_D*ryd)**2*mmts%norm*mmts%an(1)/(mmts%vol*xct%nspin)
      write(6,*) ' Sum rule (excitons) : ',sum,' eV^2'
      write(6,*) '      n         a(n)        b(n)'
      do ih=1,n0
        write(6,120) ih,mmts%an(ih),mmts%bn(ih)
      enddo
    endif
#ifdef MPI
    call MPI_BCAST(n0, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, mpierr)
    call MPI_BCAST(mmts%norm, 1, MPI_REAL_DP, 0, MPI_COMM_WORLD, mpierr)
    call MPI_BCAST(mmts%an, mmts%nmaxp, MPI_REAL_DP, 0, MPI_COMM_WORLD, mpierr)
    call MPI_BCAST(mmts%bn, mmts%nmaxp, MPI_REAL_DP, 0, MPI_COMM_WORLD, mpierr)
    call MPI_BCAST(s1, nmat, MPI_SCALAR, 0, MPI_COMM_WORLD, mpierr)
    call MPI_BCAST(s0, nmat, MPI_SCALAR, 0, MPI_COMM_WORLD, mpierr)
#endif

  else
    n0=1

!-------------------------
! Calculate the velocity (or momentum) matrix elements.
! Each PE calculates a small number of them. At the end, share data.
!
! If flag%vm.eq.0, skip this part and just read the matrix elements
! from "vmtxel".

  call logit('Calling v/p matrix elements')
  if (flag%vm.eq.0) then
    if (flag%opr .eq. 2) then
      seed =5000
      call genrand_init(put=seed)
      ! Just set the s1 vector to be random numbers on all processors 
      ! The seed has to be the same on all processors otherwise we need
      ! to set this on one and broadcast..
      call mtxel_jdos(s1,nmat)
    else

      do ikt=1, peinf%ikt(peinf%inode+1)
        ik = peinf%ik(peinf%inode+1,ikt)
        ikq = indexq_fi(ik)
        ikrq = kg_fi%indr(ik)
      
        call genwf(crys,gvec,kg_fi,syms,wfnc_fi,xct,ik,ik,work,intwfnc, is_cond = .true.)
      
        call genwf(crys,gvec,kgq_fi,syms,wfnvq_fi,xct,ik,ikq,workq,intwfnv, is_cond = .false.)
        if (flag%opr.eq.0) then
          call mtxel_v(wfnc_fi,wfnvq_fi,gvec,xct%qshift,wfnc_fi%nband,wfnvq_fi%nband,s1k)
        elseif (flag%opr.eq.1) then
          call mtxel_m(crys,wfnc_fi,wfnvq_fi,gvec,eqp,xct,wfnc_fi%nband,wfnvq_fi%nband,s1k,ik,.true.)
        endif
        do is=1,xct%nspin
          do ic=1,xct%ncband
            do iv=1,xct%nvband
              ikcvs= is + (iv - 1 + (ic - 1 + (ik - 1)*xct%ncband)*xct%nvband)*xct%nspin
              s1(ikcvs) = s1k(ic,iv,is)
            enddo
          enddo
        enddo
        SAFE_DEALLOCATE_P(wfnc_fi%cg)
        SAFE_DEALLOCATE_P(wfnc_fi%isort)
        SAFE_DEALLOCATE_P(wfnvq_fi%cg)
        SAFE_DEALLOCATE_P(wfnvq_fi%isort)
      enddo

      ! typedefs initializes all of these ikolds to 0
      if(work%ikold /= 0) then
        SAFE_DEALLOCATE_P(work%cg)
        SAFE_DEALLOCATE_P(work%isort)
        SAFE_DEALLOCATE_P(work%ph)
        SAFE_DEALLOCATE_P(work%ind)
      endif
      if(workq%ikold /= 0) then
        SAFE_DEALLOCATE_P(workq%cg)
        SAFE_DEALLOCATE_P(workq%isort)
        SAFE_DEALLOCATE_P(workq%ph)
        SAFE_DEALLOCATE_P(workq%ind)
      endif

! Share matrix elements

#ifdef MPI
      SAFE_ALLOCATE(dummy, (nmat))
      dummy = s1
      call MPI_ALLREDUCE(dummy,s1,nmat,MPI_SCALAR,MPI_SUM, MPI_COMM_WORLD,mpierr)
      SAFE_DEALLOCATE(dummy)
#endif

    endif ! Whether jdos or (velocity/momentum)

    if (peinf%inode.eq.0) then
      write(6,*) ' writing matrix elements into vmtxel'
      call open_file(16,file='vmtxel',form='unformatted',status='replace')
      write(16) xct%nkpt,xct%ncband,xct%nvband,xct%nspin,flag%opr
      write(16) (s1(ikcvs),ikcvs=1,nmat)
      call close_file(16)
    endif

  else  ! ...if the matrix elements were calculated already

    if (peinf%inode.eq.0) then
      write(6,*) ' reading matrix elements from vmtxel'
      call open_file(16,file='vmtxel',form='unformatted',status='old')
      read(16) ik,ic,iv,is,ii
      if (ik.ne.xct%nkpt.or.ic.ne.xct%ncband.or.iv.ne.xct%nvband &
        .or.is.ne.xct%nspin.or.ii.ne.flag%opr) then
        write(tmpstr,*) 'parameter mismatch in vmtxel ', &
          ik,ic,iv,is,ii,xct%nkpt,xct%ncband,xct%nvband,xct%nspin,flag%opr
        call die(tmpstr)
      endif
      
      read(16) (s1(ikcvs),ikcvs=1,nmat)
      call close_file(16)
    endif
#ifdef MPI
    call MPI_BCAST(s1,nmat,MPI_SCALAR,0,MPI_COMM_WORLD,mpierr)
#endif

  endif
  SAFE_DEALLOCATE(s1k)
  SAFE_DEALLOCATE(indexq_fi)

! JRD: Now close the no longer needed wavefunction files

  if (xct%iwriteint.eq.1) then
    SAFE_DEALLOCATE_P(intwfnc%cgk)
    SAFE_DEALLOCATE_P(intwfnv%cgk)
    SAFE_DEALLOCATE_P(intwfnc%isort)
    SAFE_DEALLOCATE_P(intwfnv%isort)
  endif
  
  if (xct%iwriteint.eq.0 .and. (flag%vm.ne.1.or.flag%dtm.ne.1)) then
    write(filename,'(a,i4.4)') 'INT_VWFNQ_', peinf%inode
    itpv = 128+(2*peinf%inode)+2
    call open_file(itpv, filename, status = 'old')
    call close_file(itpv, delete = .true.) ! files INT_VWFNQ_*

    write(filename,'(a,i4.4)') 'INT_CWFN_', peinf%inode
    itpc = 128+(2*peinf%inode)+1
    call open_file(itpc, filename, status = 'old')
    call close_file(itpc, delete = .true.) ! files INT_CWFN_*
  endif

!-------------------------
! Calculate the non-interacting spectrum. Only one PE works

  call logit('Calling absp0')

  if (peinf%inode.eq.0) call absp0(eqp,xct,s1,vol,omega_plasma,flag)

  endif

!-------------------------
! Read head of dielectric function from file (either 'epsdiag.dat' or
! 'eps0mat'/'epsmat')

  call logit('Calling epsdiag')
  if (peinf%inode.eq.0) &
    call epsdiag(crys,gvec,syms,epsi,xct,flag%eps)

#ifdef MPI
  call MPI_BARRIER(MPI_COMM_WORLD,mpierr)
  call MPI_BCAST(xct%q0vec,3,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,mpierr)
  call MPI_BCAST(epsi%nq,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpierr)
  call MPI_Bcast(xct%epshead,1,MPI_SCALAR,0,MPI_COMM_WORLD,mpierr)
  call MPI_BCAST(epsi%emax,1,MPI_REAL_DP,0,MPI_COMM_WORLD,mpierr)
  if(peinf%inode.ne.0) then
    SAFE_ALLOCATE(epsi%eps, (epsi%nq))
  endif
  call MPI_BCAST(epsi%eps, epsi%nq,MPI_REAL_DP,0,MPI_COMM_WORLD,mpierr)
  if(peinf%inode.ne.0) then
    SAFE_ALLOCATE(epsi%q, (3,epsi%nq))
  endif
  call MPI_BCAST(epsi%q,3*epsi%nq,  MPI_REAL_DP,0,MPI_COMM_WORLD,mpierr)
  call MPI_BARRIER(MPI_COMM_WORLD,mpierr)
#endif

!---------------------------
! Build Hamiltonian matrix. Diagonal part comes from the "non-interacting"
! quasiparticle Hamiltonians.  If the QP Greens func is diagonal,
! then these are just the quasiparticle energy differences on the
! diagonal.  The more general case is:
!
!       <cvk|H0|c'v'k'> = delta(k,k') *
!            [ <c|hqp|c'>*delta(v',v) - delta(c,c')*<v'|hqp|v> ]
!
! The rest of the Hamiltonian, which is the elelctron-hole interaction,
! comes from interplation further below.

  call logit('Building non-interacting Hamiltonian')
  SAFE_ALLOCATE(hmtrx, (xct%nkpt*nblock,peinf%nblocks*peinf%nblockd))
  hmtrx(:,:) = 0.d0

! Loop over kpoints

  do ikt=1,peinf%ibt(peinf%inode + 1)
    ik=peinf%ikb(peinf%inode+1,ikt)

! Build <c|hqp|c'> and <v|hqp|v'> for this kpoint

    SAFE_ALLOCATE(hqpcc, (xct%ncband,xct%ncband,xct%nspin))
    SAFE_ALLOCATE(hqpvv, (xct%nvband,xct%nvband,xct%nspin))
    hqpcc = 0.0d0
    hqpvv = 0.0d0

! Put QP energies on diagonals of hqpcc and hqpvv to start

    do is=1,xct%nspin
      do ic=1,xct%ncband
        hqpcc(ic,ic,is) = eqp%ecqp(ic,ik,is)
      enddo
      do iv=1,xct%nvband
        hqpvv(iv,iv,is) = eqp%evqp(iv,ik,is)
      enddo
    enddo

! Read possible offdiagonal QP elements from "hqp.<ik>" file
! if it exists JRD: This is broken for now.  Someone should fix
! it in the future if they want to use it

        !if (ik.lt.10) then
        !  write(tmpstr,'(a,i1)') 'hqp.',ik
        !else if (ik.lt.100) then
        !  write(tmpstr,'(a,i2)') 'hqp.',ik
        !else if (ik.lt.1000) then
        !  write(tmpstr,'(a,i3)') 'hqp.',ik
        !else if (ik.lt.10000) then
        !  write(tmpstr,'(a,i4)') 'hqp.',ik
        !else
        !  write(0,*) 'too many kpoints for reading hqp'
        !endif
        !call open_file(9,file=tmpstr,form='formatted',status='old',iostat=is)
        !if (is.eq.0) then
        !  if (peinf%inode.eq.0) then
        !    write(6,*) 'Reading offdiagonal hqp from file ',tmpstr
        !    write(6,*) 'All values in eV'
        !  endif
        !  do
        !    read(9,*,end=999) nocc,ii,jj,x,y
!
!! if ii and jj both refer to valence, states, put
!! matrix element into hqpvv
!
!            if ((ii<=nocc).and.(ii>nocc-xct%nvband).and. &
!            (jj<=nocc).and.(jj>nocc-xct%nvband)) then
!              if (peinf%inode.eq.0) write(6,'(a,2i5,2f20.10)') ' hqp(v,vp) = ',ii,jj,x,y
!              ii=nocc-ii+1
!              jj=nocc-jj+1
!              is = 1
!#ifdef CPLX
!              hqpvv(ii,jj,is) = CMPLX(x,y)/ryd
!#else
!              hqpvv(ii,jj,is) = x/ryd
!#endif
!            else if ((ii>nocc).and.(ii<=nocc+xct%ncband).and. &
!            (jj>nocc).and.(jj<=nocc+xct%ncband)) then
!              if (peinf%inode.eq.0) write(6,'(a,2i5,2f20.10)') ' hqp(c,cp) = ',ii,jj,x,y
!              ii=ii-nocc
!              jj=jj-nocc
!              is = 1
!#ifdef CPLX
!              hqpcc(ii,jj,is) = CMPLX(x,y)/ryd
!#else
!              hqpcc(ii,jj,is) = x/ryd
!#endif
!            endif
!          enddo
! 999      call close_file(9)
!          write(6,*)
!        endif ! if hqp.<ik> was found

! Now build hamiltonian from hqcc and hqvv

        ! Now build hamiltonian from hqcc and hqvv

    do is=1,xct%nspin
      do iv=1,xct%nvband
        do ic=1,xct%ncband
          !do icp=1,xct%ncband
          ikb=peinf%ikb(peinf%inode+1,ikt)
          if (xct%ipar .eq. 1) then
            ikcvs=is+(iv-1+(ic-1+(ikt-1)*xct%ncband)* &
              xct%nvband)*xct%nspin
            ivb=iv
            icb=ic
            ikcvst=is+(ivb-1+(icb-1+(ikb-1)*xct%ncband)*xct%nvband)*xct%nspin
            hmtrx(ikcvst,ikcvs) = hqpcc(icb,icb,is) - hqpvv(ivb,ivb,is)
          else if (xct%ipar .eq. 2 .and. ic .eq. 1) then
            ikcvs=is+((iv-1+(ikt-1)*xct%nvband))*xct%nspin
            icb=peinf%icb(peinf%inode+1,ikt)
            ivb=iv
            ikcvst=is+(ivb-1+(icb-1+(ikb-1)*xct%ncband)*xct%nvband)*xct%nspin
            hmtrx(ikcvst,ikcvs) = hqpcc(icb,icb,is) - hqpvv(ivb,ivb,is)
          else if (xct%ipar .eq. 3 .and. iv .eq. 1 .and. ic .eq. 1) then
            ikcvs=is+(((ikt-1)))*xct%nspin
            ivb=peinf%ivb(peinf%inode+1,ikt)
            icb=peinf%icb(peinf%inode+1,ikt)
            ikcvst=is+(ivb-1+(icb-1+(ikb-1)*xct%ncband)*xct%nvband)*xct%nspin
            hmtrx(ikcvst,ikcvs) = hqpcc(icb,icb,is) - hqpvv(ivb,ivb,is)
          endif
          !enddo
        enddo
      enddo

          !do ic=1,xct%ncband
          !  do iv=1,xct%nvband
          !    do ivp=1,xct%nvband
          !      ikcvs=is+(iv-1+(ic-1+(ik-1)*xct%ncband)* &
          !      xct%nvband)*xct%nspin
          !      ikcvst=is+(ivp-1+(ic-1+(ikt-1)*xct%ncband)* &
          !      xct%nvband)*xct%nspin
          !      hmtrx(ikcvs,ikcvst) = hmtrx(ikcvs,ikcvst) - &
          !      hqpvv(ivp,iv,is)
          !    enddo
          !  enddo
          !enddo
    enddo
    SAFE_DEALLOCATE(hqpcc)
    SAFE_DEALLOCATE(hqpvv)
  enddo ! loop on k-points on this processor

!#ifdef VERBOSE
!      call logit('Non-int Hamiltonian, node 0, kpt (0,0):', &
!      peinf)
!      if (peinf%inode.eq.0) then
!        ikt=peinf%ik(peinf%inode+1,1)
!        ik=1
!        do ic=1,min(5,xct%ncband)
!          do iv=1,xct%nvband
!            do icp=1,min(5,xct%ncband)
!              do ivp=1,xct%nvband
!                is = 1
!                ikcvs=is+(iv-1+(ic-1+(ik-1)*xct%ncband)* &
!                xct%nvband)*xct%nspin
!                ikcvst=is+(ivp-1+(icp-1+(ikt-1)*xct%ncband)* &
!                xct%nvband)*xct%nspin
!                write(6,'(4i5,2f15.8)') &
!                ic,iv,icp,ivp,hmtrx(ikcvs,ikcvst)
!              enddo
!            enddo
!          enddo
!        enddo
!      endif
!#endif

!-----------------------------------------------------------------------
! Define the mapping of eigenvectors: the ii-th column of the matrix
! egs(:,:) stored in PE #ipe will contain the eigenvector of order
! peinf%peig(ipe,ii). The total number of eigenvectors stored in
! each processor is given by peinf%neig(1:peinf%npes).
!  pblock >= maxval(peinf%neig(1:peinf%npes))
! NOTE: this is taken from diag code, but it also holds for the
! mapping of hmtrx, that has the same layout as egs (Murilo)

  SAFE_ALLOCATE(peinf%neig, (peinf%npes))
  SAFE_ALLOCATE(peinf%peig, (peinf%npes,peinf%nblocks*peinf%nblockd))
  peinf%neig=0
  peinf%peig=0
  ii=1
  do jj=1,nmat
    if (ii.eq.peinf%npes+1) ii=1
    peinf%neig(ii)=peinf%neig(ii)+1
    peinf%peig(ii,peinf%neig(ii))=jj
    if (mod(jj,peinf%nblockd).eq.0) ii=ii+1
  enddo

!---------------------------------
! Initialize local states : s1_l  --> local part of s1
!                           s0_l  --> local part of s0
! Whenever s1 and s0 change, their local parts must be updated

  SAFE_ALLOCATE(s1_l, (peinf%nblocks*peinf%nblockd))
  !MJ : CHANGE THIS

  call local_s(nmat,s1,s1_l)
  SAFE_ALLOCATE(s0_l, (peinf%nblocks*peinf%nblockd))
  call local_s(nmat,s0,s0_l)

!--------------------------------
! Interpolation scheme in the Kernel

  call logit('Calling intkernel')
  call timacc(5,1,tsec)
  call intkernel(crys,kg_fi,kp_fi,epsi,xct,hmtrx,dcc,dvv,kco,imap,flag%krnl,eqp,gvec)
  call timacc(5,2,tsec)
  SAFE_DEALLOCATE(imap)
  SAFE_DEALLOCATE(dcc)
  SAFE_DEALLOCATE(dvv)
  SAFE_DEALLOCATE(kco)

#ifdef VERBOSE
  call logit('Interacting Hamiltonian, node 0, kpt (0,0):')
!      if (peinf%inode.eq.0) then
!        ikt=peinf%ik(peinf%inode+1,1)
!        ik=1
!        do ic=1,min(5,xct%ncband)
!          do iv=1,xct%nvband
!            do icp=1,min(5,xct%ncband)
!              do ivp=1,xct%nvband
!                is = 1
!                ikcvs=is+(iv-1+(ic-1+(ik-1)*xct%ncband)* &
!                xct%nvband)*xct%nspin
!                ikcvst=is+(ivp-1+(icp-1+(ikt-1)*xct%ncband)* &
!                xct%nvband)*xct%nspin
!                write(6,'(4i5,2f15.8)') &
!                ic,iv,icp,ivp,hmtrx(ikcvs,ikcvst)
!              enddo
!            enddo
!          enddo
!        enddo
!      endif
#endif

!------------------------------
! Normalize s1 to get the first order Haydock state: |1>= (e.v) |Ground>
! Calculate the first a coefficient

  if (flag%vm .ne. 2) then

    mmts%norm =  dble( DOT_PRODUCT(s1,s1) )
    sum = sqrt(mmts%norm)
    s1(:) = s1(:)/sum
    call local_s(nmat,s1,s1_l)
  
    sum1 = dble( DOT_PRODUCT(s1,MATMUL(hmtrx,s1_l)) )
#ifdef MPI
    call MPI_Allreduce(sum1,sum,1,MPI_REAL_DP,MPI_SUM,MPI_COMM_WORLD,mpierr)
    sum1 = sum
#endif
    mmts%an(1) = sum1

#ifdef MPI
    call MPI_BARRIER(MPI_COMM_WORLD,mpierr)
#endif

!--------------------------
! Calculate the second order state, s0, and the first b coefficient

    s0 = MATMUL(hmtrx,s1_l)

#ifdef MPI
    SAFE_ALLOCATE(dummy, (nmat))
    dummy = s0
    call MPI_Allreduce(dummy,s0,nmat,MPI_SCALAR,MPI_SUM,MPI_COMM_WORLD,mpierr)
    SAFE_DEALLOCATE(dummy)
#endif

    s0(:) = s0(:) - s1(:)*mmts%an(1)
    sum = dble( DOT_PRODUCT(s0,s0) )
    mmts%bn(1) = sum
    s0(:) = s0(:)/sqrt(sum)
    call local_s(nmat,s0,s0_l)

  endif

  if (peinf%inode.eq.0) then
    write(6,750) mmts%nmax
750 format(/,'Performing Haydock recursion with ',i5,' steps. ')
    write(6,*) 'Norm of first state : ',mmts%norm
    
! Check plasmon sum rule
!
! Exact value of sum rule:
! sum = (pi/2.d0)*( plasma frequency )^2

    sum = (4.d0*PI_D*ryd)**2*mmts%norm*mmts%an(1)/ &
      (mmts%vol*xct%nspin)
    write(6,*) ' Sum rule (excitons) : ',sum,' eV^2'
    write(6,*) '      n         a(n)        b(n)'
    write(6,120) n0,mmts%an(n0),mmts%bn(n0)
  endif

#ifdef MPI
  call MPI_BARRIER(MPI_COMM_WORLD,mpierr)
#endif

!-----------------------------------------------------------------------
! Start Haydock recursion method. At this point, n0 is always an odd number
! After each pair of iterations, we have the states:
!              | ih+1 >  ---  s1
!              | ih+2 >  ---  s0
! Lower states are lost.

  call timacc(6,1,tsec)
  do ih=n0+1,mmts%nmax,2
    call iterate(mmts,xct,nmat,ih,hmtrx,s1,s0)
  enddo
  call timacc(6,2,tsec)
  SAFE_DEALLOCATE(hmtrx)
  SAFE_DEALLOCATE(s1)
  SAFE_DEALLOCATE(s0)

!---------------------------
! Calculate absorption spectrum using Haydock recursion

  endif

  if (peinf%inode.eq.0) call absh(mmts,xct%nspin,xct%eta)
  SAFE_DEALLOCATE_P(mmts%an)
  SAFE_DEALLOCATE_P(mmts%bn)
  SAFE_DEALLOCATE_P(eqp%eclda)
  SAFE_DEALLOCATE_P(eqp%evlda)

  if (xct%iwritecoul .eq. 1 .and. peinf%inode .eq. 0) then
    call close_file(19) ! file vcoul
  endif

#ifdef MPI
  call MPI_BARRIER(MPI_COMM_WORLD,mpierr)
#endif
  if (eqp%spl_tck%n>0) then
    SAFE_DEALLOCATE_P(eqp%spl_tck%t)
    SAFE_DEALLOCATE_P(eqp%spl_tck%c)
  endif

!----------------------------
! Time accounting

  ntim=6
  SAFE_ALLOCATE(routnam, (60))
  routnam(1)='TOTAL:'
  routnam(2)='INPUT:'
  routnam(3)='INPUT_Q:'
  routnam(4)='INTWFN:'
  routnam(5)='INTKERNEL:'
  routnam(6)='ITERATE:'

  call timacc(1,2,tsec)
  if(peinf%inode.eq.0) then
    write(6,*)
    write(6,9000) 'CPU [s]','WALL [s]','#'
    write(6,*)
  endif
  
  do ii=2,ntim
    call timacc(ii,3,tsec,ncount)
#ifdef MPI
    call MPI_Reduce(tsec,tmin,2,MPI_REAL_DP,MPI_MIN,0,MPI_COMM_WORLD,mpierr)
    call MPI_Reduce(tsec,tmax,2,MPI_REAL_DP,MPI_MAX,0,MPI_COMM_WORLD,mpierr)
#else
    tmin=tsec
    tmax=tsec
#endif
    if(peinf%inode.eq.0) then
      write(6,9001) routnam(ii),tmin(1),tmin(2),ncount
      write(6,9002) tsec(1),tsec(2)
      write(6,9003) tmax(1),tmax(2)
    endif
  enddo
  if(peinf%inode.eq.0) write(6,*)
    
  routnam(51)='IK Setup:'
  routnam(52)='IK C-Check:'
  routnam(53)='IK Input:'
  routnam(54)='IK Inteps:'
  routnam(55)='IK Vcoul:'
  routnam(56)='IK Denom:'
  routnam(57)='IK Interp:'
  routnam(58)='IK Sum:'
    
  do ii=51,58
    call timacc(ii,3,tsec,ncount)
#ifdef MPI
    call MPI_Reduce(tsec,tmin,2,MPI_REAL_DP,MPI_MIN,0,MPI_COMM_WORLD,mpierr)
    call MPI_Reduce(tsec,tmax,2,MPI_REAL_DP,MPI_MAX,0,MPI_COMM_WORLD,mpierr)
#else
    tmin=tsec
    tmax=tsec
#endif
    if(peinf%inode.eq.0) then
      write(6,9001) routnam(ii),tmin(1),tmin(2),ncount
      write(6,9002) tsec(1),tsec(2)
      write(6,9003) tmax(1),tmax(2)
    endif
  enddo
    
  if(peinf%inode.eq.0) write(6,*)
  
  call timacc(1,3,tsec,ncount)

#ifdef MPI
  call MPI_Reduce(tsec,tmin,2,MPI_REAL_DP,MPI_MIN,0,MPI_COMM_WORLD,mpierr)
  call MPI_Reduce(tsec,tmax,2,MPI_REAL_DP,MPI_MAX,0,MPI_COMM_WORLD,mpierr)
#else
  tmin=tsec
  tmax=tsec
#endif

  if(peinf%inode.eq.0) then
    write(6,9004) routnam(1),tmin(1),tmin(2)
    write(6,9002) tsec(1),tsec(2)
    write(6,9003) tmax(1),tmax(2)
    write(6,*)
  endif

9000 format(17x,a13,  3x,a13,  3x,a8)
9001 format(1x,a11,'(min.)',f13.3,3x,f13.3,3x,i8)
9002 format(  12x,'(PE 0)',f13.3,3x,f13.3)
9003 format(  12x,'(max.)',f13.3,3x,f13.3)
9004 format(1x,a11,'(min.)',f13.3,3x,f13.3)

  call write_memory_usage()

#ifdef MPI
  call MPI_FINALIZE(mpierr)
#endif

end subroutine haydock
