!==============================================================================
!
! Modules:
!
! chi_summation_m                               Last Modified: 04/19/2012 (FHJ)
!
!   This module creates the (distributed) polarizability matrix chi by summing
!   the (distributed) pol%gme matrices. There are routines that communicate the
!   gme matrix using either matrix or element partitioning scheme.
!
!==============================================================================

#include "f_defs.h"

module chi_summation_m

  use global_m
  use blas_m
  use mtxelmultiply_m
  use scalapack_m
  use lin_denominator_m
  use io_utils_m

  implicit none

  private

  public :: create_chi_summator, free_chi_summator,&
    chi_summation_comm_matrix, chi_summation_comm_elements

  !> FHJ: the chi_summator "object"
  type chi_summator_t 
    real(DP) :: fact
    !> DWV: below are some temporary buffers needed for the chi summation. They are
    !! described in detail in this comment.
    !! gme = g-vector matrix element
    !! gmetempX where X = n,r,c are temporary arrays for static calculations
    !! n = normalized by the proper factor used in BGW
    !! r = row, meaning the matrix elements for all bands (nv*nc*nk) that the proc owns
    !! for the G-vectors in the rows of chi currently being computed
    !! c = column, the matrix elements for all bands (nv*nc*nk) that the proc owns
    !! for the G`-vectors in the rows of chi currently being computed
    !! the RDyn arrays are needed for full-frequency (FF) calculations, real and complex
    !! while the Adyn arrays are needed only for complex FF calculations
    !! r2 is used in FF with matrix communication because you have a different denominator for 
    !! each frequency. Only the r2 array (not the r) array is used for element communication
    !! the denominators are built into the gme`s for static calculations
    !! eden arrays hold the energy denominators for FF
    !! chilocal holds the contribution of a given processor to the GG` chunk of epsilon
    !! being computed
    !! chilocal2 holds the MPI reduced GG` chunk of epsilon being computed 
    SCALAR, allocatable :: chilocal(:,:)
    SCALAR, allocatable :: chilocal2(:,:,:)
    complex(DPC), allocatable :: chilocalRDyn(:,:,:)
    complex(DPC), allocatable :: chilocal2RDyn(:,:,:,:)
    complex(DPC), allocatable :: chiRDyntmp(:)
    SCALAR, allocatable :: gmetempr(:,:),gmetempc(:,:)
    SCALAR, allocatable :: gmetempn(:)
    complex(DPC), allocatable :: gmeRDyntempn(:)
    complex(DPC), allocatable :: gmeRDyntempr2(:,:,:)
    complex(DPC), allocatable :: gmeRDyntempc(:,:)
    complex(DPC), allocatable :: gmeRDyntempcs(:,:)

#ifdef CPLX
    complex(DPC), allocatable :: chiADyntmp(:)
    complex(DPC), allocatable :: chilocalADyn(:,:,:)
    complex(DPC), allocatable :: chilocal2ADyn(:,:,:,:)
    complex(DPC), allocatable :: gmeADyntempn(:)
    complex(DPC), allocatable :: gmeADyntempr2(:,:,:)
    complex(DPC), allocatable :: gmeADyntempc(:,:) 
#endif
    integer, allocatable :: deltaCount(:,:)
    integer, allocatable :: deltaCountReduce(:,:)
  end type chi_summator_t

  public :: chi_summator_t

contains

  subroutine create_chi_summator(this, pol, fact)
    type(chi_summator_t), intent(INOUT) :: this !<the chi_summator_t object
    type(polarizability), intent(IN) :: pol
    real(DP), intent(IN) :: fact

    PUSH_SUB(create_chi_summator)

    this%fact = fact

    if (pol%freq_dep .eq. 0) then
      SAFE_ALLOCATE(this%gmetempn, (pol%nmtx))
    endif
    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)) then
      SAFE_ALLOCATE(this%gmeRDyntempn, (pol%nmtx))
      SAFE_ALLOCATE(this%chiRDyntmp, (pol%os_nfreq_para))
#ifdef CPLX
      SAFE_ALLOCATE(this%gmeADyntempn, (pol%nmtx))
      SAFE_ALLOCATE(this%chiADyntmp, (pol%os_nfreq_para))
#endif
    endif
    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
      SAFE_ALLOCATE(this%gmeRDyntempn, (pol%nmtx))
      SAFE_ALLOCATE(this%chiRDyntmp, (pol%os_nfreq_para))
    endif
    if (pol%freq_dep .eq. 3) then
      SAFE_ALLOCATE(this%gmeRDyntempn, (pol%nmtx))
      SAFE_ALLOCATE(this%chiRDyntmp, (pol%os_nfreq_para))
    endif

    POP_SUB(create_chi_summator)
    return

  end subroutine create_chi_summator

  subroutine free_chi_summator(this, pol)
    type(chi_summator_t), intent(INOUT) :: this !<the chi_summator_t object
    type(polarizability), intent(IN) :: pol

    PUSH_SUB(free_chi_summator)

    if (pol%freq_dep .eq. 0) then
      SAFE_DEALLOCATE(this%gmetempn)
    endif
    if (pol%freq_dep .eq. 2) then
      SAFE_DEALLOCATE(this%gmeRDyntempn)
      SAFE_DEALLOCATE(this%chiRDyntmp)
#ifdef CPLX
      SAFE_DEALLOCATE(this%gmeADyntempn)
      SAFE_DEALLOCATE(this%chiADyntmp)
#endif
    endif
    if (pol%freq_dep .eq. 3) then
      SAFE_DEALLOCATE(this%gmeRDyntempn)
      SAFE_DEALLOCATE(this%chiRDyntmp)
    endif

    POP_SUB(free_chi_summator)
    return

  end subroutine free_chi_summator

  !-----------------------------------------------------------------------------
  !                              GCOMM_MATRIX
  !-----------------------------------------------------------------------------

  !> Create the pol%chi matrix using gcomm_matrix sceheme
  subroutine chi_summation_comm_matrix(this,pol,scal,kp,kpq,vwfn,&
    nst,nrk,indt,pht)
    type(chi_summator_t), intent(INOUT) :: this
    type(polarizability), intent(INOUT) :: pol
    type(scalapack), intent(in) :: scal
    type(kpoints), intent(IN) :: kp,kpq
    type(valence_wfns), intent(IN) :: vwfn

    integer, intent(IN) :: nst(:)
    integer, intent(IN) :: nrk
    integer, intent(INOUT) :: indt(:,:,:)
    SCALAR,  intent(INOUT) :: pht(:,:,:)

    integer :: ntot_members(pol%os_para_freqs)
    integer :: icurr,ntot,ntot2,itot,ntotmax,ifreq_para
    integer :: ipe, ilimit, jj, iv, irk, it, ispin,im,mytot
    integer :: i_myband,tag,irank,grp_mtxel_start,tmp_iv,im_proc
    complex(DPC) :: negfact
    real(DP) :: zvalue, cv_energy
    type(cvpair_info) :: cvpair_temp
    integer, allocatable :: tmprowindex(:),tmpcolindex(:)
    SCALAR, allocatable :: tmprowph(:),tmpcolph(:)
    complex(DPC), allocatable :: gmeRDyntempr(:)
    complex(DPC), allocatable :: edenDRtemp(:)
#ifdef CPLX
    complex(DPC), allocatable :: edenDAtemp(:)
#endif

    ! frequency points for the spectral functions of the polarizability
    integer :: isfreql, isfreqr, nwarn
    real(DP) :: sfreql, sfreqr, wr, wl
    ! Hilbert transform coefficients
    complex(DPC) :: htwR(pol%nfreq,pol%nsfreq), htwA(pol%nfreq,pol%nsfreq)
    complex(DPC) :: c11,c12,c13,c14,c21,c22,c23,c24
    complex(DPC) :: cA11,cA12,cA13,cA14,cA21,cA22,cA23,cA24

    integer :: isf,nf,nsf
    real(DP) :: sf1,sf2
    real(DP) :: step1,step2,fqt,eta
    complex(DPC) :: c1,c2,j_dpc,cA1,cA2

    type(progress_info) :: prog_info

    PUSH_SUB(chi_summation_comm_matrix)

    !call alloc_summation_buffers(pol, this%fact)

    if (pol%freq_dep.eq.0) then
      SAFE_ALLOCATE(this%chilocal2, (scal%npr,scal%npc,kp%nspin))
      this%chilocal2=0
    endif

    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)) then
      SAFE_ALLOCATE(this%chilocal2RDyn, (scal%npr,scal%npc,pol%os_nfreq_para,kp%nspin))
      this%chilocal2RDyn=0
#ifdef CPLX
      SAFE_ALLOCATE(this%chilocal2ADyn, (scal%npr,scal%npc,pol%os_nfreq_para,kp%nspin))
      this%chilocal2ADyn=0
#endif
    endif

! At this moment Shishkin and Kresse method only works for gcomm_elements.
! call die in inread.f90
    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
      SAFE_ALLOCATE(this%chilocal2RDyn, (scal%npr,scal%npc,pol%os_nfreq_para,kp%nspin))
      this%chilocal2RDyn=0
    endif

    if (pol%freq_dep .eq. 3) then
      SAFE_ALLOCATE(this%chilocal2RDyn, (scal%npr,scal%npc,pol%os_nfreq_para,kp%nspin))
      this%chilocal2RDyn=0
    endif

    ntot=0
    ntot2=0

    ntot = peinf%nvownactual*peinf%ncownactual
    do irk = 1, nrk
      ntot2=ntot2 + nst(irk)
    enddo
    ntot=ntot*ntot2
    !-------------------------------------------------------------------
    ! Static Be Here


    if (pol%freq_dep .eq. 0) then

      call progress_init(prog_info, 'building polarizability matrix', 'processor', &
        peinf%npes)
      do ipe = 1, peinf%npes
        call progress_step(prog_info)
        SAFE_ALLOCATE(this%chilocal, (scal%nprd(ipe),scal%npcd(ipe)))
        this%chilocal=0D0
        SAFE_ALLOCATE(this%gmetempr, (scal%nprd(ipe),ntot))
        SAFE_ALLOCATE(this%gmetempc, (ntot,scal%npcd(ipe)))

        do ispin = 1 , kp%nspin

          call mtxelmultiply(scal,ntot,nrk,nst,this%fact,vwfn, &
            this%gmetempr,this%gmetempc,this%chilocal,pol%gme,pol,indt,pht,ipe,ispin)

          if (peinf%inode.eq.0) call timacc(14,1)

#ifdef MPI
          call MPI_Reduce(this%chilocal(1,1),this%chilocal2(1,1,ispin),scal%npcd(ipe)*scal%nprd(ipe),MPI_SCALAR, &
            MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr)

#else
          this%chilocal2(:,:,ispin)=this%chilocal(:,:)
#endif
          if (peinf%inode.eq.0) call timacc(14,2)                  
        enddo ! ispin

        SAFE_DEALLOCATE(this%chilocal)
        SAFE_DEALLOCATE(this%gmetempr)
        SAFE_DEALLOCATE(this%gmetempc)

      enddo ! ipe
      call progress_free(prog_info)

      do ispin =1, kp%nspin
        pol%chi(:,:,ispin) = this%chilocal2(:,:,ispin)
      enddo
      SAFE_DEALLOCATE(this%chilocal2)

    endif ! pol%freq_dep .eq. 0

    !-------------------------------------
    ! Full Frequency Be Here

    negfact = -1D0*this%fact

    if ( ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)) .or. pol%freq_dep .eq. 3 ) then
      grp_mtxel_start=peinf%inode !for non-parallel freq case
      if(pol%os_para_freqs .gt. 1) then
      endif

      call progress_init(prog_info, 'building polarizability matrix', 'processor', &
        peinf%npes_freqgrp)
      do ipe = 1, peinf%npes_freqgrp
        call progress_step(prog_info)
#ifdef VERBOSE
        if(peinf%inode.eq.0) then
          write(6,'(A,i8,6x,A,i8,A)') '### ipe=',ipe,'(npes=',peinf%npes,')'
        endif
#endif
#ifdef MPI
        call MPI_barrier(MPI_COMM_WORLD,mpierr)
#endif
        SAFE_ALLOCATE(this%gmeRDyntempr2, (scal%nprd(ipe),ntot,pol%os_nfreq_para))
        SAFE_ALLOCATE(this%gmeRDyntempc, (ntot,scal%npcd(ipe)))
        SAFE_ALLOCATE(this%chilocalRDyn, (scal%nprd(ipe),scal%npcd(ipe),pol%os_nfreq_para))
        this%chilocalRDyn=0
#ifdef CPLX
        if( pol%freq_dep .ne. 3) then
          SAFE_ALLOCATE(this%gmeADyntempr2, (scal%nprd(ipe),ntot,pol%os_nfreq_para))
          SAFE_ALLOCATE(this%gmeADyntempc, (ntot,scal%npcd(ipe)))
          SAFE_ALLOCATE(this%chilocalADyn, (scal%nprd(ipe),scal%npcd(ipe),pol%os_nfreq_para))
          this%chilocalADyn=0
        endif
#endif


        do ispin = 1 , kp%nspin

          itot = 0

          if (peinf%inode.eq.0) call timacc(31,1)

          SAFE_ALLOCATE(tmprowindex,(scal%nprd(ipe)))
          SAFE_ALLOCATE(tmpcolindex,(scal%npcd(ipe)))
          SAFE_ALLOCATE(tmprowph,(scal%nprd(ipe)))
          SAFE_ALLOCATE(tmpcolph,(scal%npcd(ipe)))

          do im=1,pol%os_para_freqs                          ! im labels which member of the mtxel comm group are you
            im_proc=peinf%rank_f+1+(im-1)*peinf%npes_freqgrp ! im_proc gives this mtxel comm group member`s global
            do irk = 1, nrk                                  ! proc number
              do it = 1, nst(irk)

                do icurr=1,scal%nprd(ipe)
                  tmprowindex(icurr) = indt(scal%imyrowd(icurr,ipe),it,irk)
                  tmprowph(icurr) = pht(scal%imyrowd(icurr,ipe),it,irk)
                enddo
                do icurr=1,scal%npcd(ipe)
                  tmpcolindex(icurr) = indt(scal%imycold(icurr,ipe),it,irk)
                  tmpcolph(icurr) = pht(scal%imycold(icurr,ipe),it,irk)
                enddo
                do iv = 1,vwfn%nband+pol%ncrit

                  tmp_iv = peinf%global_indexv(iv,im_proc)

                  if (peinf%does_it_ownv(iv,im_proc)) then
                    ilimit = peinf%global_ncown(im_proc)
                  else
                    ilimit = 0
                  endif
#ifdef CPLX
                  !$OMP PARALLEL private (mytot,zvalue,edenDRtemp,edenDAtemp,gmeRDyntempr,jj,icurr)
                  if( pol%freq_dep .ne. 3) then
                    SAFE_ALLOCATE(edenDAtemp, (pol%os_nfreq_para))
                  endif
#else
                  !$OMP PARALLEL private (mytot,zvalue,edenDRtemp,gmeRDyntempr,jj,icurr)
#endif
                  SAFE_ALLOCATE(gmeRDyntempr, (scal%nprd(ipe)))
                  SAFE_ALLOCATE(edenDRtemp, (pol%os_nfreq_para))
                  !$OMP DO
                  do i_myband = 1, ilimit
                    mytot = itot + i_myband
                    zvalue = pol%edenDyn(peinf%global_indexv(iv,im_proc),i_myband,ispin,irk,im)
                    if(pol%lin_denominator<TOL_Zero) then
                      !this is when the lin_denominator mode is not active.
                      do jj=1+peinf%rank_mtxel,pol%nfreq,pol%os_para_freqs
                        ifreq_para=(jj+pol%os_para_freqs-1)/pol%os_para_freqs
                        if (abs(zvalue) .gt. Tol_Zero) then
                          edenDRtemp(ifreq_para)= -0.5d0*( &
                            1d0/(zvalue-(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)+ &
                            1d0/(zvalue+(pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd))
                        else
                          edenDRtemp(ifreq_para)= 0D0
                        endif
                      enddo

                    endif 
#ifdef CPLX
                    if(pol%freq_dep.ne. 3) then
                      if(pol%lin_denominator<TOL_Zero) then
                        !this is when the lin_denominator mode is not active.
                        do jj=1+peinf%rank_mtxel,pol%nfreq,pol%os_para_freqs
                          ifreq_para=(jj+pol%os_para_freqs-1)/pol%os_para_freqs
                          if (abs(zvalue) .gt. Tol_Zero) then
                            edenDAtemp(ifreq_para)= -0.5d0*( &
                              1d0/(zvalue-(-pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd)+ &
                              1d0/(zvalue+(-pol%dFreqBrd(jj)+pol%dFreqGrid(jj))/ryd))
                          else
                            edenDAtemp(ifreq_para)= 0D0
                          endif
                        enddo

                      endif
                    endif
#endif

                    do icurr=1,scal%nprd(ipe)
                      gmeRDyntempr(icurr)=pol%gme(tmprowindex(icurr), &
                        i_myband,tmp_iv,ispin,irk,im) * tmprowph(icurr)
                    enddo
                    do jj = 1+peinf%rank_mtxel,pol%nfreq,pol%os_para_freqs
                      ifreq_para=(jj+pol%os_para_freqs-1)/pol%os_para_freqs
                      this%gmeRDyntempr2(:,mytot,ifreq_para)=gmeRDyntempr(:)*edenDRtemp(ifreq_para)
                    enddo
                    do icurr=1,scal%npcd(ipe)
                      this%gmeRDyntempc(mytot,icurr) = &
                        MYCONJG( pol%gme(tmpcolindex(icurr),i_myband,tmp_iv,ispin,irk,im) * tmpcolph(icurr) )
                    enddo
#ifdef CPLX
                    if( pol%freq_dep .ne. 3) then
                      do jj = 1+peinf%rank_mtxel,pol%nfreq,pol%os_para_freqs
                        ifreq_para=(jj+pol%os_para_freqs-1)/pol%os_para_freqs
! POSSIBLY BAD FOR OPENMP
                        this%gmeADyntempr2(:,mytot,ifreq_para)=gmeRDyntempr(:)*edenDAtemp(ifreq_para)
                      enddo
                      do icurr=1,scal%npcd(ipe)
! PROBABLY BAD FOR OPENMP
                        this%gmeADyntempc(mytot,icurr)= &
                          MYCONJG( pol%gme(tmpcolindex(icurr),i_myband,tmp_iv,ispin,irk,im) * tmpcolph(icurr) )
                      enddo
                    endif
#endif
                  enddo ! i_myband
                  !$OMP END DO
                  SAFE_DEALLOCATE(edenDRtemp)
#ifdef CPLX
                  if( pol%freq_dep .ne. 3) then
                    SAFE_DEALLOCATE(edenDAtemp)
                  endif
#endif           
                  SAFE_DEALLOCATE(gmeRDyntempr)
                  !$OMP END PARALLEL
                  itot = itot+ilimit
                enddo ! iv
              enddo ! it
            enddo ! irk
          enddo ! im

          SAFE_DEALLOCATE(tmprowindex)
          SAFE_DEALLOCATE(tmpcolindex)
          SAFE_DEALLOCATE(tmprowph)
          SAFE_DEALLOCATE(tmpcolph)

          if (peinf%inode.eq.0) call timacc(31,2)          

          !Do the zgemm`s
          if(ntot > 0) then
            do jj =1+peinf%rank_mtxel,pol%nfreq,pol%os_para_freqs
              ifreq_para=(jj+pol%os_para_freqs-1)/pol%os_para_freqs
              if (peinf%inode.eq.0) call timacc(30,1)
              call zgemm('n','n',scal%nprd(ipe),scal%npcd(ipe),ntot, &
                negfact,this%gmeRDyntempr2(:,:,ifreq_para),scal%nprd(ipe),this%gmeRDyntempc(:,:),ntot,&
                (0D0,0D0),this%chilocalRDyn(:,:,ifreq_para),scal%nprd(ipe))
              if (peinf%inode.eq.0) call timacc(30,2)
#ifdef CPLX
              if(pol%freq_dep .ne. 3) then
                if (peinf%inode.eq.0) call timacc(30,1)
                call zgemm('n','n',scal%nprd(ipe),scal%npcd(ipe),ntot, &
                  negfact,this%gmeADyntempr2(:,:,ifreq_para),scal%nprd(ipe),this%gmeADyntempc(:,:),ntot,&
                  (0D0,0D0),this%chilocalADyn(:,:,ifreq_para),scal%nprd(ipe))
                if (peinf%inode.eq.0) call timacc(30,2)
              endif
#endif
            enddo
          endif

          if (peinf%inode.eq.0) call timacc(14,1)

          if(pol%os_para_freqs .eq. 1) then            
#ifdef MPI
            call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), &
              pol%os_nfreq_para*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,&
              MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr)
#ifdef CPLX
            if(pol%freq_dep .ne. 3) then
              call MPI_Reduce(this%chilocalADyn(1,1,1),this%chilocal2ADyn(1,1,1,ispin), &
                pol%os_nfreq_para*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,&
                MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr)
            endif
#endif
#endif
          else
#ifdef MPI
            call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), &
              pol%os_nfreq_para*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,&
              MPI_SUM,ipe-1,peinf%freq_comm,mpierr)
#ifdef CPLX
            if(pol%freq_dep .ne. 3) then
              call MPI_Reduce(this%chilocalADyn(1,1,1),this%chilocal2ADyn(1,1,1,ispin), &
                pol%os_nfreq_para*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,&
                MPI_SUM,ipe-1,peinf%freq_comm,mpierr)
            endif
#endif
#endif
          endif
#ifndef MPI
          this%chilocal2RDyn(:,:,:,ispin)=this%chilocalRDyn(:,:,:)
#ifdef CPLX
          if(pol%freq_dep .ne. 3) then
            this%chilocal2ADyn(:,:,:,ispin)=this%chilocalADyn(:,:,:)
          endif
#endif

#endif
          if (peinf%inode.eq.0) call timacc(14,2)

        enddo ! ispin
        if (peinf%inode.eq.0) call timacc(44,1)
        SAFE_DEALLOCATE(this%chilocalRDyn)
        SAFE_DEALLOCATE(this%gmeRDyntempr2)
        SAFE_DEALLOCATE(this%gmeRDyntempc)
#ifdef CPLX
        if(pol%freq_dep .ne. 3) then
          SAFE_DEALLOCATE(this%chilocalADyn)
          SAFE_DEALLOCATE(this%gmeADyntempr2)
          SAFE_DEALLOCATE(this%gmeADyntempc)
        endif
#endif
        if (peinf%inode.eq.0) call timacc(44,2)
      enddo ! ipe
      call progress_free(prog_info)

      do ispin =1, kp%nspin
        do jj=1+peinf%rank_mtxel,pol%nfreq,pol%os_para_freqs
          ifreq_para=(jj+pol%os_para_freqs-1)/pol%os_para_freqs
          pol%chiRDyn(ifreq_para,:,:,ispin) = this%chilocal2RDyn(:,:,ifreq_para,ispin)
#ifdef CPLX
          if( pol%freq_dep .ne. 3) then
            pol%chiADyn(ifreq_para,:,:,ispin) = this%chilocal2ADyn(:,:,ifreq_para,ispin)
          endif
#endif
        enddo ! jj
      enddo ! ispin
      SAFE_DEALLOCATE(this%chilocal2RDyn)
#ifdef CPLX
      if(pol%freq_dep .ne. 3) then
        SAFE_DEALLOCATE(this%chilocal2ADyn)
      endif
#endif
    endif ! (pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)

    !-------------------------------------
    ! Full Frequency Be Here.
    ! M. Shishkin and G. Kresse, Implementation and performance of the 
    ! frequency-dependent GW method within the PAW framework,
    ! PHYSICAL REVIEW B 74, 035101, 2006.

    negfact = -1D0*this%fact

    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
      grp_mtxel_start=peinf%inode !for non-parallel freq case
      if(pol%os_para_freqs .gt. 1) then
      endif

      ! -------------------------------------------
      ! compute the Hilbert transform coefficients
      ! -------------------------------------------
      j_dpc=(0.0,1.0)
      htwR(:,:)=(0.0,0.0)

      nf=pol%nfreq
      nsf=pol%nsfreq
      do jj=1,nf
        eta=pol%dBrdning/ryd
        fqt=pol%dFreqGrid(jj)/ryd
        do isf=1,nsf
          if (isf==1) then
            c1=(0.0,0.0)
            step1=1.0
#ifdef CPLX
            cA1=(0.0,0.0)
            step1=1.0
#endif
          else
            sf1=pol%dSFreqGrid(isf-1)/ryd
            sf2=pol%dSFreqGrid(isf)/ryd
            step1=sf2-sf1

            c11=((sf1-fqt)*(-1.0*j_dpc)-eta)*(atan((fqt-sf2)/(eta))&
              &          -atan((fqt-sf1)/(eta)))
            c12=0.50*(sf1-fqt+(-1.0*j_dpc)*eta)*log(((fqt-sf2)**2.0+eta*eta)&
              &          /(((fqt-sf1)**2.0+eta*eta)))

            c13=-((sf1+fqt)*j_dpc-eta)*(atan((fqt+sf2)/(eta))&
              &          -atan((fqt+sf1)/(eta)))
            c14=0.50*(sf1+fqt+j_dpc*eta)*log(((fqt+sf2)**2.0+eta*eta)&
              &          /(((fqt+sf1)**2.0+eta*eta)))

            c1=c11+c12+c13+c14
            c1=c1/step1

#ifdef CPLX
            cA11=((sf1-fqt)*(j_dpc)-eta)*(atan((fqt-sf2)/(eta))&
              &          -atan((fqt-sf1)/(eta)))
            cA12=0.50*(sf1-fqt+(j_dpc)*eta)*log(((fqt-sf2)**2.0+eta*eta)&
              &          /(((fqt-sf1)**2.0+eta*eta)))

            cA13=-((sf1+fqt)*(-1.0*j_dpc)-eta)*(atan((fqt+sf2)/(eta))&
              &          -atan((fqt+sf1)/(eta)))
            cA14=0.50*(sf1+fqt+(-1.0*j_dpc)*eta)*log(((fqt+sf2)**2.0+eta*eta)&
              &          /(((fqt+sf1)**2.0+eta*eta)))

            cA1=cA11+cA12+cA13+cA14
            cA1=cA1/step1
#endif
          endif

          if (isf==nsf) then
            c2=(0.0,0.0)
            step2=1.0
#ifdef CPLX
            cA2=(0.0,0.0)
            step2=1.0
#endif
          else
            sf1=pol%dSFreqGrid(isf)/ryd
            sf2=pol%dSFreqGrid(isf+1)/ryd

            step2=sf2-sf1

            c21=((sf2-fqt)*(-1.0*j_dpc)-eta)*(atan((fqt-sf1)/(eta))&
              &          -atan((fqt-sf2)/(eta)))
            c22=0.50*(sf2-fqt+(-1.0*j_dpc)*eta)*log(((fqt-sf1)**2.0+eta*eta)&
              &          /(((fqt-sf2)**2.0+eta*eta)))

            c23=-((sf2+fqt)*j_dpc-eta)*(atan((fqt+sf1)/(eta))&
              &          -atan((fqt+sf2)/(eta)))
            c24=0.50*(sf2+fqt+j_dpc*eta)*log(((fqt+sf1)**2.0+eta*eta)&
              &          /(((fqt+sf2)**2.0+eta*eta)))

            c2=c21+c22+c23+c24

            c2=c2/step2

#ifdef CPLX
            cA21=((sf2-fqt)*(j_dpc)-eta)*(atan((fqt-sf1)/(eta))&
              &          -atan((fqt-sf2)/(eta)))
            cA22=0.50*(sf2-fqt+(j_dpc)*eta)*log(((fqt-sf1)**2.0+eta*eta)&
              &          /(((fqt-sf2)**2.0+eta*eta)))

            cA23=-((sf2+fqt)*(-1.0*j_dpc)-eta)*(atan((fqt+sf1)/(eta))&
              &          -atan((fqt+sf2)/(eta)))
            cA24=0.50*(sf2+fqt+(-1.0*j_dpc)*eta)*log(((fqt+sf1)**2.0+eta*eta)&
              &          /(((fqt+sf2)**2.0+eta*eta)))

            cA2=cA21+cA22+cA23+cA24

            cA2=cA2/step2
#endif
          endif

          if (isf==1.or.isf==nsf) then
            htwR(jj,isf)=0.5d0*(c1/step1+c2/step2)
#ifdef CPLX
            htwA(jj,isf)=0.5d0*(cA1/step1+cA2/step2)
#endif
          else
            htwR(jj,isf)=1.0d0*(c1+c2)/(step1+step2)
#ifdef CPLX
            htwA(jj,isf)=1.0d0*(cA1+cA2)/(step1+step2)
#endif
          endif

        enddo
      enddo

      ! ----------------------------------------------------               
      ! compute the spectral functions of the polarizability
      ! ----------------------------------------------------

      call progress_init(prog_info, 'building polarizability matrix', 'processor', &
        peinf%npes_freqgrp)

      nwarn=0
      do ipe = 1, peinf%npes_freqgrp
        call progress_step(prog_info)
#ifdef VERBOSE
        if(peinf%inode.eq.0) then
          write(6,'(A,i8,6x,A,i8,A)') '### ipe=',ipe,'(npes=',peinf%npes,')'
        endif
#endif
#ifdef MPI
        call MPI_barrier(MPI_COMM_WORLD,mpierr)
#endif

        SAFE_ALLOCATE(gmeRDyntempr, (scal%nprd(ipe)))
        SAFE_ALLOCATE(this%gmeRDyntempr2, (scal%nprd(ipe),ntot,pol%os_nsfreq_para))
        SAFE_ALLOCATE(this%gmeRDyntempc, (ntot,scal%npcd(ipe)))
        SAFE_ALLOCATE(this%chilocalRDyn, (scal%nprd(ipe),scal%npcd(ipe),pol%os_nsfreq_para))
        this%chilocalRDyn=0

        gmeRDyntempr=(0.0,0.0)
        this%gmeRDyntempr2=(0.0,0.0)
        this%gmeRDyntempc=(0.0,0.0)

        do ispin = 1 , kp%nspin

          itot = 0

          if (peinf%inode.eq.0) call timacc(31,1)

          SAFE_ALLOCATE(tmprowindex,(scal%nprd(ipe)))
          SAFE_ALLOCATE(tmpcolindex,(scal%npcd(ipe)))
          SAFE_ALLOCATE(tmprowph,(scal%nprd(ipe)))
          SAFE_ALLOCATE(tmpcolph,(scal%npcd(ipe)))

          do im=1,pol%os_para_freqs                          ! im labels which member of the mtxel comm group are you
            im_proc=peinf%rank_f+1+(im-1)*peinf%npes_freqgrp ! im_proc gives this mtxel comm group member`s global
            do irk = 1, nrk                                  ! proc number
              do it = 1, nst(irk)

                do icurr=1,scal%nprd(ipe)
                  tmprowindex(icurr) = indt(scal%imyrowd(icurr,ipe),it,irk)
                  tmprowph(icurr) = pht(scal%imyrowd(icurr,ipe),it,irk)
                enddo
                do icurr=1,scal%npcd(ipe)
                  tmpcolindex(icurr) = indt(scal%imycold(icurr,ipe),it,irk)
                  tmpcolph(icurr) = pht(scal%imycold(icurr,ipe),it,irk)
                enddo

                do iv = 1,vwfn%nband+pol%ncrit

                  tmp_iv = peinf%global_indexv(iv,im_proc)

                  if (peinf%does_it_ownv(iv,im_proc)) then
                    ilimit = peinf%global_ncown(im_proc)
                  else
                    ilimit = 0
                  endif

                  !$OMP PARALLEL private (mytot,zvalue,gmeRDyntempr,icurr, &
                  !$OMP                   isfreql,isfreqr,sfreql,sfreqr, &
                  !$OMP                   wl,wr,i_myband,jj)
                  !$OMP DO
                  do i_myband = 1, ilimit

                    zvalue = -pol%edenDyn(peinf%global_indexv(iv,im_proc),i_myband,ispin,irk,im)
                    if (abs(zvalue) .gt. Tol_Zero) then

                      mytot = itot + i_myband
                      isfreql=-1

                      do jj=pol%nsfreq,1,-1
                        if ((pol%dSFreqGrid(jj)/ryd)<zvalue) then
                          isfreql=jj
                          EXIT
                        endif
                      enddo

                      if (isfreql.eq.pol%nsfreq) then
                        nwarn=nwarn+1
                        if (nwarn==1.and.peinf%inode.eq.0) then
                          write(0,*) 'WARNING: for accuracy, sfrequency_high_cutoff should be '
                          write(0,*) 'larger than energy difference between highest unoccupied '
                          write(0,*) 'state and lowest occupied state.'
                        endif
                        cycle
                      endif

                      sfreql=pol%dSFreqGrid(isfreql)/ryd
                      isfreqr=isfreql+1
                      sfreqr=pol%dSFreqGrid(isfreqr)/ryd
                    
                      wr=  (zvalue-sfreql)/(sfreqr-sfreql)
                      wl= -(zvalue-sfreqr)/(sfreqr-sfreql)
                    
                      do icurr=1,scal%nprd(ipe)
                        gmeRDyntempr(icurr)=pol%gme(tmprowindex(icurr), &
                          i_myband,tmp_iv,ispin,irk,im) * tmprowph(icurr)
                      enddo
                    
                      this%gmeRDyntempr2(:,mytot,isfreql)=gmeRDyntempr(:)*wl
                      this%gmeRDyntempr2(:,mytot,isfreqr)=gmeRDyntempr(:)*wr
                    
                      do icurr=1,scal%npcd(ipe)
                        this%gmeRDyntempc(mytot,icurr) = &
                          MYCONJG( pol%gme(tmpcolindex(icurr),i_myband,tmp_iv,ispin,irk,im) * tmpcolph(icurr) )
                      enddo
                    endif
                  enddo ! i_myband
                  !$OMP END DO
                  !$OMP END PARALLEL
                  itot = itot+ilimit

                enddo ! iv
              enddo ! it
            enddo ! irk
          enddo ! im

          SAFE_DEALLOCATE(tmprowindex)
          SAFE_DEALLOCATE(tmpcolindex)
          SAFE_DEALLOCATE(tmprowph)
          SAFE_DEALLOCATE(tmpcolph)

          if (peinf%inode.eq.0) call timacc(31,2)          

          !Do the zgemm`s
          if(ntot > 0) then
            do jj =1+peinf%rank_mtxel, pol%nsfreq,pol%os_para_freqs

              if (peinf%inode.eq.0) call timacc(30,1)
              call zgemm('n','n',scal%nprd(ipe),scal%npcd(ipe),ntot, &
                negfact,this%gmeRDyntempr2(:,:,jj),scal%nprd(ipe),this%gmeRDyntempc(:,:),ntot,&
                (0D0,0D0),this%chilocalRDyn(:,:,jj),scal%nprd(ipe))
              if (peinf%inode.eq.0) call timacc(30,2)
            enddo
          endif

          if (peinf%inode.eq.0) call timacc(14,1)

          if(pol%os_para_freqs .eq. 1) then            
#ifdef MPI
            call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), &
              pol%os_nsfreq_para*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,&
              MPI_SUM,ipe-1,MPI_COMM_WORLD,mpierr)
#endif
          else
#ifdef MPI
            call MPI_Reduce(this%chilocalRDyn(1,1,1),this%chilocal2RDyn(1,1,1,ispin), &
              pol%os_nsfreq_para*scal%npcd(ipe)*scal%nprd(ipe),MPI_COMPLEX_DPC,&
              MPI_SUM,ipe-1,peinf%freq_comm,mpierr)
#endif
          endif
#ifndef MPI
          this%chilocal2RDyn(:,:,:,ispin)=this%chilocalRDyn(:,:,:)
#endif
          if (peinf%inode.eq.0) call timacc(14,2)

        enddo ! ispin
        if (peinf%inode.eq.0) call timacc(44,1)
        SAFE_DEALLOCATE(this%chilocalRDyn)
        SAFE_DEALLOCATE(gmeRDyntempr)
        SAFE_DEALLOCATE(this%gmeRDyntempr2)
        SAFE_DEALLOCATE(this%gmeRDyntempc)

        if (peinf%inode.eq.0) call timacc(44,2)
      enddo ! ipe
      call progress_free(prog_info)

      do ispin =1, kp%nspin
        do jj=1+peinf%rank_mtxel,pol%nsfreq,pol%os_para_freqs
          pol%chiTDyn(jj,:,:,ispin) = this%chilocal2RDyn(:,:,jj,ispin)
        enddo ! jj
      enddo ! ispin
      SAFE_DEALLOCATE(this%chilocal2RDyn)

      ! -----------------------------
      ! Hilbert transform
      ! ------------------------------

      call zgemm('n','n',pol%nfreq,scal%npr*scal%npc*kp%nspin,pol%os_nsfreq_para, &
        (-1D0,0D0),htwR(:,:),pol%nfreq,pol%chiTDyn(:,:,:,:),pol%os_nsfreq_para, &
        (0D0,0D0),pol%chiRDyn(:,:,:,:),pol%nfreq)

#ifdef CPLX
      call zgemm('n','n',pol%nfreq,scal%npr*scal%npc*kp%nspin,pol%os_nsfreq_para, &
        (-1D0,0D0),htwA(:,:),pol%nfreq,pol%chiTDyn(:,:,:,:),pol%os_nsfreq_para, &
        (0D0,0D0),pol%chiADyn(:,:,:,:),pol%nfreq)
#endif

    endif ! pol%freq_dep.eq.2.and.pol%freq_dep_method.eq.1

    !call free_summation_buffers(pol)

    POP_SUB(chi_summation_comm_matrix)
    return

  end subroutine chi_summation_comm_matrix

  !-----------------------------------------------------------------------------
  !                              GCOMM_ELEMENTS
  !-----------------------------------------------------------------------------

  !> Create the pol%chi matrix using gcomm_elements sceheme
  subroutine chi_summation_comm_elements(this,pol,scal,kp,vwfn,cwfn,&
    nst,nrk,indt,pht)
    type(chi_summator_t), intent(INOUT) :: this
    type(polarizability), intent(INOUT) :: pol
    type(scalapack), intent(in) :: scal
    type(kpoints), intent(IN) :: kp
    type(valence_wfns), intent(IN) :: vwfn
    type(conduction_wfns), intent(IN) :: cwfn

    integer, intent(IN) :: nst(:)
    integer, intent(IN) :: nrk
    integer, intent(INOUT) :: indt(:,:,:)
    SCALAR,  intent(INOUT) :: pht(:,:,:)

    integer :: icurr,ntot,itot
    integer :: iv, ic, irk, it, ispin

    SCALAR :: temp_gme
    integer, allocatable :: iowna(:)
    integer :: isend

    complex(DPC), allocatable :: edenDRtemp(:)
#ifdef CPLX
    complex(DPC), allocatable :: edenDAtemp(:)
#endif

    real(DP) :: zvalue
    ! frequency points for the spectral functions of the polarizability
    integer :: isfreql, isfreqr
    real(DP) :: sfreql, sfreqr, wr, wl
    ! Hilbert tranform coefficients
    complex(DPC) :: htwR(pol%nfreq,pol%nsfreq), htwA(pol%nfreq,pol%nsfreq)
    complex(DPC) :: c11,c12,c13,c14,c21,c22,c23,c24
    complex(DPC) :: cA11,cA12,cA13,cA14,cA21,cA22,cA23,cA24

    integer :: isf,nf,nsf,ifreq_para
    real(DP) :: sf1,sf2
    real(DP) :: step1,step2,fqt,eta
    complex(DPC) :: c1,c2,j_dpc,cA1,cA2

    integer :: ii, jj
    type(progress_info) :: prog_info

    integer :: nsftot, il, ir, n1, n2, n3, max_nv, nwarn
    integer, allocatable :: count_v(:), ind_v(:,:), ind_sf(:) 
   
    PUSH_SUB(chi_summation_comm_elements)

    !call alloc_summation_buffers(pol, this%fact)

    if (peinf%inode.eq.0) call timacc(44,1)

    SAFE_ALLOCATE(iowna, (vwfn%nband+pol%ncrit))

    if (pol%freq_dep .eq. 0) then
      SAFE_ALLOCATE(this%chilocal, (scal%npr,scal%npc))
    endif

    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)) then
      SAFE_ALLOCATE(this%chilocalRDyn, (scal%npr,scal%npc,pol%os_nfreq_para))
#ifdef CPLX
      SAFE_ALLOCATE(this%chilocalADyn, (scal%npr,scal%npc,pol%os_nfreq_para))
#endif
    endif

    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
      SAFE_ALLOCATE(this%chilocalRDyn, (scal%npr,scal%npc,pol%os_nsfreq_para))
  
      SAFE_ALLOCATE(count_v, (pol%os_nsfreq_para))
      SAFE_ALLOCATE(ind_sf, (pol%os_nsfreq_para))
    endif

    if (peinf%inode.eq.0) call timacc(44,2)

#ifdef VERBOSE
    call logit("Starting chi Sum")
#endif

    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
      ! -------------------------------------------
      ! compute the Hilbert transform coefficients
      ! -------------------------------------------

      if (peinf%inode.eq.0) call timacc(53,1)

      j_dpc=(0.0,1.0)
      htwR(:,:)=(0.0,0.0)

      nf=pol%nfreq
      nsf=pol%nsfreq
      do jj=1,nf
        eta=pol%dBrdning/ryd
        fqt=pol%dFreqGrid(jj)/ryd
        do isf=1,nsf
          if (isf==1) then
            c1=(0.0,0.0)
            step1=1.0
#ifdef CPLX
            cA1=(0.0,0.0)
            step1=1.0
#endif
          else
            sf1=pol%dSFreqGrid(isf-1)/ryd
            sf2=pol%dSFreqGrid(isf)/ryd
            step1=sf2-sf1

            c11=((sf1-fqt)*(-1.0*j_dpc)-eta)*(atan((fqt-sf2)/(eta))&
              &          -atan((fqt-sf1)/(eta)))
            c12=0.50*(sf1-fqt+(-1.0*j_dpc)*eta)*log(((fqt-sf2)**2.0+eta*eta)&
              &          /(((fqt-sf1)**2.0+eta*eta)))

            c13=-((sf1+fqt)*j_dpc-eta)*(atan((fqt+sf2)/(eta))&
              &          -atan((fqt+sf1)/(eta)))
            c14=0.50*(sf1+fqt+j_dpc*eta)*log(((fqt+sf2)**2.0+eta*eta)&
              &          /(((fqt+sf1)**2.0+eta*eta)))

            c1=c11+c12+c13+c14
            c1=c1/step1

#ifdef CPLX
            cA11=((sf1-fqt)*(j_dpc)-eta)*(atan((fqt-sf2)/(eta))&
              &          -atan((fqt-sf1)/(eta)))
            cA12=0.50*(sf1-fqt+(j_dpc)*eta)*log(((fqt-sf2)**2.0+eta*eta)&
              &          /(((fqt-sf1)**2.0+eta*eta)))

            cA13=-((sf1+fqt)*(-1.0*j_dpc)-eta)*(atan((fqt+sf2)/(eta))&
              &          -atan((fqt+sf1)/(eta)))
            cA14=0.50*(sf1+fqt+(-1.0*j_dpc)*eta)*log(((fqt+sf2)**2.0+eta*eta)&
              &          /(((fqt+sf1)**2.0+eta*eta)))

            cA1=cA11+cA12+cA13+cA14
            cA1=cA1/step1
#endif
          endif

          if (isf==nsf) then
            c2=(0.0,0.0)
            step2=1.0
#ifdef CPLX
            cA2=(0.0,0.0)
            step2=1.0
#endif
          else
            sf1=pol%dSFreqGrid(isf)/ryd
            sf2=pol%dSFreqGrid(isf+1)/ryd

            step2=sf2-sf1

            c21=((sf2-fqt)*(-1.0*j_dpc)-eta)*(atan((fqt-sf1)/(eta))&
              &          -atan((fqt-sf2)/(eta)))
            c22=0.50*(sf2-fqt+(-1.0*j_dpc)*eta)*log(((fqt-sf1)**2.0+eta*eta)&
              &          /(((fqt-sf2)**2.0+eta*eta)))

            c23=-((sf2+fqt)*j_dpc-eta)*(atan((fqt+sf1)/(eta))&
              &          -atan((fqt+sf2)/(eta)))
            c24=0.50*(sf2+fqt+j_dpc*eta)*log(((fqt+sf1)**2.0+eta*eta)&
              &          /(((fqt+sf2)**2.0+eta*eta)))

            c2=c21+c22+c23+c24

            c2=c2/step2

#ifdef CPLX
            cA21=((sf2-fqt)*(j_dpc)-eta)*(atan((fqt-sf1)/(eta))&
              &          -atan((fqt-sf2)/(eta)))
            cA22=0.50*(sf2-fqt+(j_dpc)*eta)*log(((fqt-sf1)**2.0+eta*eta)&
              &          /(((fqt-sf2)**2.0+eta*eta)))

            cA23=-((sf2+fqt)*(-1.0*j_dpc)-eta)*(atan((fqt+sf1)/(eta))&
              &          -atan((fqt+sf2)/(eta)))
            cA24=0.50*(sf2+fqt+(-1.0*j_dpc)*eta)*log(((fqt+sf1)**2.0+eta*eta)&
              &          /(((fqt+sf2)**2.0+eta*eta)))

            cA2=cA21+cA22+cA23+cA24

            cA2=cA2/step2
#endif
          endif

          if (isf==1.or.isf==nsf) then
            htwR(jj,isf)=0.5d0*(c1/step1+c2/step2)
#ifdef CPLX
            htwA(jj,isf)=0.5d0*(cA1/step1+cA2/step2)
#endif
          else
            htwR(jj,isf)=1.0d0*(c1+c2)/(step1+step2)
#ifdef CPLX
            htwA(jj,isf)=1.0d0*(cA1+cA2)/(step1+step2)
#endif
          endif

        enddo
      enddo

      if (peinf%inode.eq.0) call timacc(53,2)

    endif!(pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)

    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
      pol%chiTDyn=(0.0,0.0)
    endif

    call progress_init(prog_info, 'building polarizability matrix', 'blocks', &
      nrk*kp%nspin*(cwfn%nband-vwfn%nband))
    nwarn=0
    do irk=1,nrk
#ifdef VERBOSE
      if(peinf%inode.eq.0) then
        write(6,'(A,i8,6x,A,i8,A)') '### irk=',irk,'(nrk=',nrk,')'
      endif
#endif
#ifdef MPI
      call MPI_barrier(MPI_COMM_WORLD,mpierr)
#endif
      do ispin=1,kp%nspin
        if (peinf%inode.eq.0) call timacc(44,1)

        iowna(:)=1
        ntot=(vwfn%nband+pol%ncrit)*nst(irk)
        if (pol%freq_dep .eq. 0) then
          this%chilocal=0
          SAFE_ALLOCATE(this%gmetempr, (scal%npr,ntot))
          SAFE_ALLOCATE(this%gmetempc, (ntot,scal%npc))
        endif
        if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)) then
          this%chilocalRDyn=0
          SAFE_ALLOCATE(this%gmeRDyntempr2, (scal%npr,ntot,pol%os_nfreq_para))
          SAFE_ALLOCATE(this%gmeRDyntempc, (ntot,scal%npc))
#ifdef CPLX
          this%chilocalADyn=0
          SAFE_ALLOCATE(this%gmeADyntempr2, (scal%npr,ntot,pol%os_nfreq_para))
          SAFE_ALLOCATE(this%gmeADyntempc, (ntot,scal%npc))
#endif
        endif

        if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
          this%chilocalRDyn=(0.0,0.0)
         
          max_nv=0
          do ic=1,cwfn%nband-vwfn%nband          
            count_v=0
            do iv=1,(vwfn%nband+pol%ncrit)

              isend=peinf%global_pairowner(iv,ic)-1
              if (isend .lt. 0) then
                write(0,*) 'Illegal value for mpi proc, isend: ',iv,ic
                call die("internal error in chi_summation")
              endif
              if (isend .eq. peinf%inode) then
                zvalue=-pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1)
                iowna(iv)=iowna(iv)+1
              endif

#ifdef MPI
              call MPI_Bcast(zvalue,1,MPI_REAL_DP,isend,MPI_COMM_WORLD,mpierr)
              call MPI_Bcast(pol%dSFreqGrid,pol%os_nsfreq_para,MPI_REAL_DP,isend,MPI_COMM_WORLD,mpierr)
#endif

              if (scal%npr*scal%npc .ne. 0) then
                do it=1, nst(irk)

                  if (abs(zvalue) .gt. Tol_Zero) then
                    isfreql=-1
                    do jj=pol%nsfreq,1,-1
                      if ((pol%dSFreqGrid(jj)/ryd)<zvalue) then
                        isfreql=jj
                        EXIT
                      endif
                    enddo

                    if (isfreql.eq.pol%nsfreq) then
                      nwarn=nwarn+1
                      if (nwarn==1.and.peinf%inode.eq.0) then
                        write(0,*) 'WARNING: for accuracy, sfrequency_high_cutoff should be '
                        write(0,*) 'larger than energy difference between highest unoccupied '
                        write(0,*) 'state and lowest occupied state.'
                      endif
                      cycle
                    endif

                    isfreqr=isfreql+1

                    count_v(isfreql)=count_v(isfreql)+1
                    count_v(isfreqr)=count_v(isfreqr)+1
                  endif
                enddo !it
              endif
            enddo !iv

            if (max_nv<maxval(count_v(:))) then
              max_nv=maxval(count_v(:))
            endif
          enddo !ic

          SAFE_ALLOCATE(this%gmeRDyntempr2, (scal%npr,max_nv,pol%os_nsfreq_para))
          SAFE_ALLOCATE(this%gmeRDyntempc, (ntot,scal%npc))
          SAFE_ALLOCATE(this%gmeRDyntempcs, (max_nv,scal%npc))

          this%gmeRDyntempr2=(0.0,0.0)
          this%gmeRDyntempc=(0.0,0.0)
          this%gmeRDyntempcs=(0.0,0.0)

          SAFE_ALLOCATE(ind_v, (pol%os_nsfreq_para, max_nv))

          this%gmeRDyntempr2=(0.0,0.0)
          iowna(:)=1
        endif!(pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)

        if (peinf%inode.eq.0) call timacc(44,2)

        do ic=1,cwfn%nband-vwfn%nband
          call progress_step(prog_info)

          ! We do two giant loops here for freq_dep cases
          if (pol%freq_dep .eq. 0) then
            itot=0
            if (peinf%inode.eq.0) call timacc(14,1)
            do iv=1,(vwfn%nband+pol%ncrit)
              isend=peinf%global_pairowner(iv,ic)-1
              if (isend .lt. 0) then
                write(0,*) 'Illegal value for mpi proc, isend:',&
                  peinf%inode,iv,ic,peinf%global_pairowner(iv,ic)
                call die("internal error in chi_summation")
              endif
              if (isend .eq. peinf%inode) then
                if (iowna(iv) .gt. peinf%ncownactual) call die('iowna(iv) bigger than ncownactual')
                this%gmetempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv), &
                  ispin,irk,1) * sqrt(this%fact)
                iowna(iv)=iowna(iv)+1
              endif
#ifdef MPI
              call MPI_Bcast(this%gmetempn,pol%nmtx,MPI_SCALAR,isend,MPI_COMM_WORLD,mpierr)
#endif
              if (scal%npr*scal%npc .ne. 0) then

                do it =1, nst(irk)
                  itot = itot + 1

                  do icurr=1,scal%npr
                    this%gmetempr(icurr,itot)=this%gmetempn(indt(scal%imyrow(icurr),it,irk)) * &
                      pht(scal%imyrow(icurr),it,irk)
                  enddo

                  do icurr=1,scal%npc
                    temp_gme = this%gmetempn(indt(scal%imycol(icurr),it,irk))
                    this%gmetempc(itot,icurr)=MYCONJG(temp_gme * pht(scal%imycol(icurr),it,irk) )
                  enddo
                enddo ! it
              endif
            enddo ! iv
            if (peinf%inode.eq.0) call timacc(14,2)

            ! JRD: Using Level3 BLAS here for better performance

            if (scal%npr*scal%npc .ne. 0 .and. ntot > 0) then
              if (peinf%inode.eq.0) call timacc(30,1)

              call X(gemm)('n','n',scal%npr,scal%npc,ntot, &
                -ONE,this%gmetempr,scal%npr,this%gmetempc,ntot,ONE,this%chilocal,scal%npr)

              if (peinf%inode.eq.0) call timacc(30,2)
            endif

          endif ! pol%freq_dep .eq. 0

          !---------------------
          ! JRD: Full Frequency Be Here

          if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)) then

            if (peinf%inode.eq.0) call timacc(44,1)

            SAFE_ALLOCATE(edenDRtemp, (pol%os_nfreq_para))
#ifdef CPLX
            SAFE_ALLOCATE(edenDAtemp, (pol%os_nfreq_para))
#endif
            if (peinf%inode.eq.0) call timacc(44,2)

            itot=0
            do iv=1,(vwfn%nband+pol%ncrit)
              if (peinf%inode.eq.0) call timacc(14,1)

              isend=peinf%global_pairowner(iv,ic)-1
              if (isend .lt. 0) then
                write(0,*) 'Illegal value for mpi proc, isend: ',iv,ic
                call die("internal error in chi_summation")
              endif
              if (isend .eq. peinf%inode) then
                this%gmeRDyntempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv), &
                  ispin,irk,1) * sqrt(this%fact)
                if (abs(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1)) .gt. Tol_Zero) then
                  do jj=1+peinf%rank_mtxel,pol%nfreq,pol%os_para_freqs
                    ifreq_para=(jj+pol%os_para_freqs-1)/pol%os_para_freqs
                    edenDRtemp(ifreq_para)= -0.50d0 * ( 1d0/(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1) &
                      -(pol%dFreqGrid(jj)+pol%dFreqBrd(jj))/ryd)+& 
                      1d0/(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1)+&
                      (pol%dFreqGrid(jj)+pol%dFreqBrd(jj))/ryd))
                  enddo
                else
                  edenDRtemp(:)=0D0
                endif
#ifdef CPLX
                this%gmeADyntempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv),ispin,irk,1) * sqrt(this%fact)
                if (abs(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1)) .gt. Tol_Zero) then
                  do jj=1+peinf%rank_mtxel,pol%nfreq,pol%os_para_freqs
                    ifreq_para=(jj+pol%os_para_freqs-1)/pol%os_para_freqs
                    edenDAtemp(ifreq_para)= -0.50d0 * ( 1d0/(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1) &
                     -(pol%dFreqGrid(jj)-pol%dFreqBrd(jj))/ryd)+ &
                     1d0/(pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1)+&
                     (pol%dFreqGrid(jj)-pol%dFreqBrd(jj))/ryd))
                  enddo
                else
                  edenDAtemp(:)=0D0
                endif
#endif
                iowna(iv)=iowna(iv)+1
              endif

#ifdef MPI
              call MPI_Bcast(this%gmeRDyntempn,pol%nmtx,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr)
              call MPI_Bcast(edenDRtemp,pol%os_nfreq_para,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr)
#ifdef CPLX
              call MPI_Bcast(this%gmeADyntempn,pol%nmtx,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr)
              call MPI_Bcast(edenDAtemp,pol%os_nfreq_para,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr)
#endif
#endif
              if (peinf%inode.eq.0) call timacc(14,2)

              if (scal%npr*scal%npc .ne. 0) then
                do it =1, nst(irk)
                  if (peinf%inode.eq.0) call timacc(51,1)

                  itot = itot + 1
                  do icurr=1,scal%npr
                    this%gmeRDyntempr2(icurr,itot,:)= &
                      (this%gmeRDyntempn(indt(scal%imyrow(icurr),it,irk))*pht(scal%imyrow(icurr),it,irk))*edenDRtemp
                  enddo

                  if (peinf%inode.eq.0) call timacc(51,2)

                  if (peinf%inode.eq.0) call timacc(52,1)

                  do icurr=1,scal%npc
                    this%gmeRDyntempc(itot,icurr) = &
                      CONJG(this%gmeRDyntempn(indt(scal%imycol(icurr),it,irk))*pht(scal%imycol(icurr),it,irk))
                  enddo

                  if (peinf%inode.eq.0) call timacc(52,2)

#ifdef CPLX
                  if (peinf%inode.eq.0) call timacc(51,1)
                  do icurr=1,scal%npr
                    this%gmeADyntempr2(icurr,itot,:)=this%gmeADyntempn( &
                      indt(scal%imyrow(icurr),it,irk))*pht(scal%imyrow(icurr),it,irk)*edenDAtemp
                  enddo
                  if (peinf%inode.eq.0) call timacc(51,2)
                  if (peinf%inode.eq.0) call timacc(52,1)
                  do icurr=1,scal%npc
                    this%gmeADyntempc(itot,icurr) = &
                      CONJG(this%gmeADyntempn(indt(scal%imycol(icurr),it,irk))*pht(scal%imycol(icurr),it,irk))
                  enddo
                  if (peinf%inode.eq.0) call timacc(52,2)
#endif
                enddo ! it
              endif

            enddo ! iv


            ! JRD: Using Level3 BLAS here for better performance

            if (scal%npr*scal%npc .ne. 0 .and. ntot > 0) then
              do jj =1+peinf%rank_mtxel,pol%nfreq,pol%os_para_freqs
                if (peinf%inode.eq.0) call timacc(30,1)
                ifreq_para=(jj+pol%os_para_freqs-1)/pol%os_para_freqs
                call zgemm('n','n',scal%npr,scal%npc,ntot,(-1D0,0D0),this%gmeRDyntempr2(:,:,ifreq_para),scal%npr, &
                  this%gmeRDyntempc(:,:),ntot,(1D0,0D0),this%chilocalRDyn(:,:,ifreq_para),scal%npr)
                if (peinf%inode.eq.0) call timacc(30,2)
#ifdef CPLX
                if (peinf%inode.eq.0) call timacc(30,1)
                call zgemm('n','n',scal%npr,scal%npc,ntot,(-1D0,0D0),this%gmeADyntempr2(:,:,ifreq_para),scal%npr, &
                  this%gmeADyntempc(:,:),ntot,(1D0,0D0),this%chilocalADyn(:,:,ifreq_para),scal%npr)
                if (peinf%inode.eq.0) call timacc(30,2)
#endif
              enddo
            endif

            if (peinf%inode.eq.0) call timacc(44,1)
            SAFE_DEALLOCATE(edenDRtemp)
#ifdef CPLX
            SAFE_DEALLOCATE(edenDAtemp)
#endif
            if (peinf%inode.eq.0) call timacc(44,2)

          endif ! (pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)

          !---------------------
          ! Full Frequency Be Here(shishkin and Kresse 2006)

          if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
            count_v=0
            ind_v=0
            ind_sf=0
            nsftot=0
            itot=0

            do iv=1,(vwfn%nband+pol%ncrit)
              if (peinf%inode.eq.0) call timacc(14,1)

              isend=peinf%global_pairowner(iv,ic)-1
              if (isend .lt. 0) then
                write(0,*) 'Illegal value for mpi proc, isend: ',iv,ic
                call die("internal error in chi_summation")
              endif
              if (isend .eq. peinf%inode) then
                this%gmeRDyntempn(:) = pol%gme(:,iowna(iv),peinf%indexv(iv), &
                  ispin,irk,1) * sqrt(this%fact)
                zvalue=-pol%edenDyn(peinf%indexv(iv),iowna(iv),ispin,irk,1)
                iowna(iv)=iowna(iv)+1
              endif

#ifdef MPI
              call MPI_Bcast(this%gmeRDyntempn,pol%nmtx,MPI_COMPLEX_DPC,isend,MPI_COMM_WORLD,mpierr)
              call MPI_Bcast(zvalue,1,MPI_REAL_DP,isend,MPI_COMM_WORLD,mpierr)
              call MPI_Bcast(pol%dSFreqGrid,pol%os_nsfreq_para,MPI_REAL_DP,isend,MPI_COMM_WORLD,mpierr)
#endif
              if (peinf%inode.eq.0) call timacc(14,2)
              ! compute spectral functions of the polarizability

              if (scal%npr*scal%npc .ne. 0) then
                do it=1, nst(irk)

                  if (abs(zvalue) .gt. Tol_Zero) then
                    if (peinf%inode.eq.0) call timacc(51,1)

                    itot=itot+1
                    isfreql=-1
                    do jj=pol%nsfreq,1,-1
                      if ((pol%dSFreqGrid(jj)/ryd)<zvalue) then
                        isfreql=jj
                        EXIT
                      endif
                    enddo

                    if (isfreql.eq.pol%nsfreq) then
                      cycle
                    endif
                  
                    isfreqr=isfreql+1
                  
                    count_v(isfreql)=count_v(isfreql)+1
                    count_v(isfreqr)=count_v(isfreqr)+1
                  
                    il=count_v(isfreql)
                    ir=count_v(isfreqr)
                  
                    ind_v(isfreql,il)=itot
                    ind_v(isfreqr,ir)=itot
                  
                    sfreql=pol%dSFreqGrid(isfreql)/ryd
                    sfreqr=pol%dSFreqGrid(isfreqr)/ryd
                  
                    wl=-(zvalue-sfreqr)/(sfreqr-sfreql)
                    wr=(zvalue-sfreql)/(sfreqr-sfreql)
                  
                    do icurr=1,scal%npr
                      this%gmeRDyntempr2(icurr,il,isfreql)=this%gmeRDyntempn( &
                        indt(scal%imyrow(icurr),it,irk))*pht(scal%imyrow(icurr),it,irk)*wl
                      this%gmeRDyntempr2(icurr,ir,isfreqr)=this%gmeRDyntempn( &
                        indt(scal%imyrow(icurr),it,irk))*pht(scal%imyrow(icurr),it,irk)*wr
                    enddo
                  
                    if (peinf%inode.eq.0) call timacc(51,2)
                  
                    if (peinf%inode.eq.0) call timacc(52,1)
                  
                    do icurr=1,scal%npc
                      this%gmeRDyntempc(itot,icurr) = &
                        CONJG(this%gmeRDyntempn(indt(scal%imycol(icurr),it,irk))*pht(scal%imycol(icurr),it,irk))
                    enddo

                    if (peinf%inode.eq.0) call timacc(52,2)
                  endif

                enddo ! it
              endif

            enddo ! iv

            if (peinf%inode.eq.0) call timacc(44,1)

            jj=0
            do ii=1+peinf%rank_mtxel,pol%nsfreq,pol%os_para_freqs
              if (count_v(ii)>0) then
                jj=jj+1
                ind_sf(jj)=ii
              endif
            enddo
            nsftot=jj
            if (peinf%inode.eq.0) call timacc(44,2)

            if (scal%npr*scal%npc .ne. 0 .and. ntot > 0) then
              do ii=1, nsftot
                if (peinf%inode.eq.0) call timacc(30,1)

                n1=ind_sf(ii)
                n2=count_v(n1)
                
                do jj=1,n2
                  n3=ind_v(n1,jj)
                  this%gmeRDyntempcs(jj,:)=this%gmeRDyntempc(n3,:)
                enddo

                call zgemm('n','n',scal%npr,scal%npc,n2,(-1D0,0D0),this%gmeRDyntempr2(:,:,n1),scal%npr, &
                  this%gmeRDyntempcs(:,:),max_nv,(1D0,0D0),this%chilocalRDyn(:,:,n1),scal%npr)

                if (peinf%inode.eq.0) call timacc(30,2)
              enddo
            endif
          endif ! (pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)

        enddo ! ic (loop over conduction bands)

        if (peinf%inode.eq.0) call timacc(44,1)
        if (pol%freq_dep .eq. 0) then
          if (scal%npr*scal%npc .ne. 0) then
            pol%chi(:,:,ispin) = pol%chi(:,:,ispin) + this%chilocal(:,:)
          endif
          SAFE_DEALLOCATE(this%gmetempr)
          SAFE_DEALLOCATE(this%gmetempc)
        endif

        if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)) then
          if (scal%npr*scal%npc .ne. 0) then
            do jj = 1+peinf%rank_mtxel,pol%nfreq,pol%os_para_freqs
              ifreq_para=(jj+pol%os_para_freqs-1)/pol%os_para_freqs 
              pol%chiRDyn(ifreq_para,:,:,ispin) = pol%chiRDyn(ifreq_para,:,:,ispin) + this%chilocalRDyn(:,:,ifreq_para)
#ifdef CPLX
              pol%chiADyn(ifreq_para,:,:,ispin) = pol%chiADyn(ifreq_para,:,:,ispin) + this%chilocalADyn(:,:,ifreq_para)
#endif
            enddo
          endif
          SAFE_DEALLOCATE(this%gmeRDyntempr2)
          SAFE_DEALLOCATE(this%gmeRDyntempc)
#ifdef CPLX
          SAFE_DEALLOCATE(this%gmeADyntempr2)
          SAFE_DEALLOCATE(this%gmeADyntempc)
#endif
        endif
        if (peinf%inode.eq.0) call timacc(44,2)

        if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then

          if (peinf%inode.eq.0) call timacc(44,1)
          if (scal%npr*scal%npc .ne. 0) then
            do jj = 1+peinf%rank_mtxel, pol%nsfreq,pol%os_para_freqs
              pol%chiTDyn(jj,:,:,ispin) = pol%chiTDyn(jj,:,:,ispin) + this%chilocalRDyn(:,:,jj)
            enddo
          endif
          SAFE_DEALLOCATE(this%gmeRDyntempr2)
          SAFE_DEALLOCATE(this%gmeRDyntempc)
          SAFE_DEALLOCATE(this%gmeRDyntempcs)
          SAFE_DEALLOCATE(ind_v)
          if (peinf%inode.eq.0) call timacc(44,2)
        endif
        
      enddo ! ispin (loop over spins)
    enddo ! irk (loop over k-points in set rk)
    call progress_free(prog_info)

    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then

      ! -------------------------
      ! Hilbert transform
      ! -------------------------

      if (peinf%inode.eq.0) call timacc(53,1)

      call zgemm('n','n',pol%nfreq,scal%npr*scal%npc*kp%nspin,pol%os_nsfreq_para, &
        (-1D0,0D0),htwR(:,:),pol%nfreq,pol%chiTDyn(:,:,:,:),pol%os_nsfreq_para, &
        (0D0,0D0),pol%chiRDyn(:,:,:,:),pol%nfreq)

#ifdef CPLX
      call zgemm('n','n',pol%nfreq,scal%npr*scal%npc*kp%nspin,pol%os_nsfreq_para, &
        (-1D0,0D0),htwA(:,:),pol%nfreq,pol%chiTDyn(:,:,:,:),pol%os_nsfreq_para, &
        (0D0,0D0),pol%chiADyn(:,:,:,:),pol%nfreq)
#endif

       if (peinf%inode.eq.0) call timacc(53,2)
    endif !(pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)

    if (peinf%inode.eq.0) call timacc(44,1)
    if (pol%freq_dep .eq. 0) then
      SAFE_DEALLOCATE(this%chilocal)
    endif
    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)) then
      SAFE_DEALLOCATE(this%chilocalRDyn)
#ifdef CPLX
      SAFE_DEALLOCATE(this%chilocalADyn)
#endif
    endif
    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
      SAFE_DEALLOCATE(this%chilocalRDyn)
      SAFE_DEALLOCATE(count_v)
      SAFE_DEALLOCATE(ind_sf)
    endif

    SAFE_DEALLOCATE(iowna)
    !call free_summation_buffers(pol)

    if (peinf%inode.eq.0) call timacc(44,2)

    POP_SUB(chi_summation_comm_elements)
    return

  end subroutine chi_summation_comm_elements

end module chi_summation_m 
