!=================================================================================
!
! Module write_matrix_m
!
! (1) write_matrix_d()          Originally by JRD       Last Modified 5/1/2008 (JRD)
!
! This program writes a distributed matrix like chimat or epsmat to file.
!
! (2) write_matrix_f()          Originally by JRD       Last Modified 2/5/2009 (CHP)
!
! Modification of write_matrix_d for full-frequency.
!
!=================================================================================

#include "f_defs.h"

module write_matrix_m

  use global_m
#ifdef HDF5
  use hdf5
#endif
  use scalapack_m
  use io_utils_m

  implicit none

  private

  public :: &
    write_matrix_d, &
    write_matrix_f
#ifdef HDF5
  public :: &
    write_matrix_ser_hdf, &
    write_matrix_f_ser_hdf, &
    write_matrix_d_hdf, &
    write_matrix_f_hdf, &
    write_matrix_diagonal_hdf, &
    write_gvec_indices_hdf
#ifdef USESCALAPACK
  public :: &
    write_matrix_d_par_hdf, &
    write_matrix_f_par_hdf
#endif
#endif

contains

!===================================================================================

subroutine write_matrix_d(scal,matrix,nmtx,iunit)
  type(scalapack), intent(in) :: scal
  SCALAR, intent(in) :: matrix(:,:) !< (scal%npr,scal%npc)
  integer, intent(in) :: nmtx
  integer, intent(in) :: iunit
  
  integer :: ii, jj
#ifdef USESCALAPACK
  SCALAR, allocatable :: tempcol(:),tempcol2(:)
  integer :: irow, icol, irowm, icolm
  integer :: icurr
#endif
  type(progress_info) :: prog_info !< a user-friendly progress report

  PUSH_SUB(write_matrix_d)

#ifdef VERBOSE
  if(peinf%inode .eq. 0) then
    write(6,*) ' Writing matrix: ', nmtx, iunit
    write(6,*) ' '
  endif
#endif

#ifdef USESCALAPACK
  SAFE_ALLOCATE(tempcol, (nmtx))
  SAFE_ALLOCATE(tempcol2, (nmtx))
  
  icurr=0
  
  call progress_init(prog_info, 'writing matrix', 'column', nmtx)
  do jj = 1, nmtx
    call progress_step(prog_info, jj)
!        if (peinf%inode .eq. 0) then
!          write(6,*) ' In loop: ', ii
!        endif
    icol=MOD(INT(((jj-1)/scal%nbl)+TOL_SMALL),scal%npcol)
    tempcol=0d0
    if (icol .eq. scal%mypcol) then
      do ii = 1, nmtx
        irow=MOD(INT(((ii-1)/scal%nbl)+TOL_SMALL),scal%nprow)
        if (irow .eq. scal%myprow) then
          icurr=icurr+1
          icolm=INT((icurr-1)/scal%npr+TOL_SMALL)+1
          irowm=MOD((icurr-1),scal%npr)+1
          tempcol(ii)=matrix(irowm,icolm)
          
!                if (icolm .gt. scal%npc .or. irowm.gt.scal%npr) then
!                  write(6,*) 'Error: ', scal%npc,scal%npr,icolm,irowm
!                endif

        endif
      enddo
    endif
    if (peinf%inode .eq. 0) then
      tempcol2=0d0
    endif
    call MPI_REDUCE(tempcol,tempcol2,nmtx,MPI_SCALAR,MPI_SUM,0, &
      MPI_COMM_WORLD,mpierr)
    if (peinf%inode .eq. 0) then
      write(iunit) (tempcol2(ii),ii=1,nmtx)
    endif
    
    call MPI_barrier(MPI_COMM_WORLD,mpierr)
    
  enddo
  call progress_free(prog_info)
  
  SAFE_DEALLOCATE(tempcol)
  SAFE_DEALLOCATE(tempcol2)

!      if(peinf%inode .eq. 0) then
!        write(6,*) ' Done Writing chimat: '
!      endif

#else

  if (peinf%inode .eq. 0) then
    call progress_init(prog_info, 'writing matrix', 'column', nmtx)
    do jj = 1, nmtx
      call progress_step(prog_info, jj)
      write(iunit) (matrix(ii, jj), ii = 1, nmtx)
    enddo
    call progress_free(prog_info)
  endif
  
#endif
  
  POP_SUB(write_matrix_d)
  
  return
end subroutine write_matrix_d

!=================================================================================

subroutine write_matrix_f(scal,nfreq,retarded,ONLYIFCPLX(advanced)nmtx,iunit,para_freqs)
  type(scalapack), intent(in) :: scal
  integer, intent(in) :: nfreq
  complex(DPC), intent(in) :: retarded(:,:,:) !< (nfreq_para,scal%npr,scal%npc)
#ifdef CPLX
  complex(DPC), intent(in) :: advanced(:,:,:) !< (nfreq_para,scal%npr,scal%npc)
#endif
  integer, intent(in) :: nmtx
  integer, intent(in) :: iunit
  integer, intent(in) :: para_freqs
  
  integer :: ii, jj, ifreq
#ifdef USESCALAPACK
  complex(DPC), allocatable :: tempcolR(:,:),tempcolR2(:,:)
  complex(DPC), allocatable :: tempcolA(:,:),tempcolA2(:,:)
  integer :: irow, icol, irowm, icolm,freq_grp_ind,ifreq_para
  integer :: icurr
#endif
  type(progress_info) :: prog_info !< a user-friendly progress report

  PUSH_SUB(write_matrix_f)
  
#ifdef VERBOSE
  if(peinf%inode .eq. 0) then
    write(6,*) ' Writing matrix: ', nfreq, nmtx, iunit
    write(6,*) ' '
  endif
#endif
  
#ifdef USESCALAPACK
  
  SAFE_ALLOCATE(tempcolR, (nfreq,nmtx))
  SAFE_ALLOCATE(tempcolR2, (nfreq,nmtx))
#ifdef CPLX
  SAFE_ALLOCATE(tempcolA, (nfreq,nmtx))
  SAFE_ALLOCATE(tempcolA2, (nfreq,nmtx))
#endif

  icurr=0
  
  call progress_init(prog_info, 'writing matrix', 'column', nmtx)
  do jj = 1, nmtx
    call progress_step(prog_info, jj)
!        if (peinf%inode .eq. 0) then
!          write(6,*) ' In loop: ', ii
!        endif
    icol=MOD(INT(((jj-1)/scal%nbl)+TOL_SMALL),scal%npcol)
    tempcolR=0d0
#ifdef CPLX
    tempcolA=0d0
#endif
    if (icol .eq. scal%mypcol) then
      do ii = 1, nmtx
        irow=MOD(INT(((ii-1)/scal%nbl)+TOL_SMALL),scal%nprow)
        if (irow .eq. scal%myprow) then
          icurr=icurr+1
          icolm=INT((icurr-1)/scal%npr+TOL_SMALL)+1
          irowm=MOD((icurr-1),scal%npr)+1
          do ifreq=1,nfreq
            freq_grp_ind=mod(ifreq-1,para_freqs)
            ifreq_para=(ifreq+para_freqs-1)/para_freqs
            if(freq_grp_ind .eq. peinf%rank_mtxel) then
              tempcolR(ifreq,ii)=retarded(ifreq_para,irowm,icolm)
#ifdef CPLX
              tempcolA(ifreq,ii)=advanced(ifreq_para,irowm,icolm)
#endif
            endif
          enddo
        endif
      enddo
    endif
    if (peinf%inode .eq. 0) then
      tempcolR2=0d0
#ifdef CPLX
      tempcolA2=0d0
#endif
    endif
    call MPI_REDUCE(tempcolR(1,1),tempcolR2(1,1),nfreq*nmtx, &
      MPI_COMPLEX_DPC,MPI_SUM,0,MPI_COMM_WORLD,mpierr)
#ifdef CPLX
    call MPI_REDUCE(tempcolA(1,1),tempcolA2(1,1),nfreq*nmtx, &
      MPI_COMPLEX_DPC,MPI_SUM,0,MPI_COMM_WORLD,mpierr)
#endif
    if (peinf%inode .eq. 0) then
      do ii = 1, nmtx
        write(iunit) (tempcolR2(ifreq,ii),ifreq=1,nfreq)
      enddo
#ifdef CPLX
      do ii = 1, nmtx
        write(iunit) (tempcolA2(ifreq,ii),ifreq=1,nfreq)
      enddo
#endif
    endif
    
    call MPI_barrier(MPI_COMM_WORLD,mpierr)
    
  enddo
  call progress_free(prog_info)
  
  SAFE_DEALLOCATE(tempcolR)
  SAFE_DEALLOCATE(tempcolR2)
#ifdef CPLX
  SAFE_DEALLOCATE(tempcolA)
  SAFE_DEALLOCATE(tempcolA2)
#endif

#else

  if(peinf%inode .eq. 0) then
    call progress_init(prog_info, 'writing matrix', 'column', nmtx)
    do jj = 1, nmtx
      call progress_step(prog_info, jj)
      do ii = 1, nmtx
        write(iunit) (retarded(ifreq, ii, jj), ifreq= 1, nfreq)
      enddo
#ifdef CPLX
      do ii = 1, nmtx
        write(iunit) (advanced(ifreq, ii, jj),ifreq = 1, nfreq)
      enddo
#endif
    enddo
    call progress_free(prog_info)
  endif
  
#endif
  
  POP_SUB(write_matrix_f)
  
  return
end subroutine write_matrix_f

#ifdef HDF5

!========================================================================
! JRD: The HDF5 Equivalents of the above routines.
!========================================================================

!==========================================================================================

subroutine write_matrix_diagonal_hdf(epsdiag,nmtx,iq,isize,name)
  real(DP), intent(in) :: epsdiag(:,:,:) !< (isize,nmtx,1)
  integer, intent(in) :: nmtx
  integer, intent(in) :: iq
  integer, intent(in) :: isize
  character(len=*), intent(in) :: name

  integer(HID_T) :: file_id       ! File identifier
  integer(HID_T) :: dset_id       ! Dataset identifier
  integer(HID_T) :: filespace     ! Dataspace identifier in file
  integer(HID_T) :: memspace      ! Dataspace identifier in mem

  integer(HSIZE_T) :: dims(3), offset(3), offsetm(3)

  integer :: error, rank

  PUSH_SUB(write_matrix_diagonal_hdf)

  call h5fopen_f(trim(name), H5F_ACC_RDWR_F, file_id, error)

! Write Array

  rank = 3
  dims(1) = isize
  dims(2) = nmtx
  dims(3) = 1
  offset(1) = 0
  offset(2) = 0
  offset(3) = iq - 1
  offsetm(:) = 0

  call h5dopen_f(file_id, 'matrix-diagonal', dset_id, error)
  call h5screate_simple_f(rank, dims, memspace, error)
  call h5dget_space_f(dset_id,filespace,error)
  call h5sselect_hyperslab_f(memspace, H5S_SELECT_SET_F, offsetm, dims, error)
  call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, offset, dims, error)
  call h5dwrite_f(dset_id, H5T_NATIVE_DOUBLE, epsdiag, dims, error, memspace, filespace)
  call h5dclose_f(dset_id, error)
  call h5sclose_f(memspace, error)
  call h5sclose_f(filespace, error)

  call h5fclose_f(file_id, error)

  POP_SUB(write_matrix_diagonal_hdf)

end subroutine write_matrix_diagonal_hdf

!===================================================================================

subroutine write_matrix_ser_hdf(matrix,nmtx,iq,is,name)
  SCALAR, intent(in) :: matrix(:,:) !< (nmtx,nmtx)
  integer, intent(in) :: nmtx
  integer, intent(in) :: iq
  integer, intent(in) :: is
  character(len=*), intent(in) :: name

  integer :: error, rank, ii, jj

  integer(HID_T) :: file_id       ! File identifier
  integer(HID_T) :: dset_id       ! Dataset identifier
  integer(HID_T) :: filespace     ! Dataspace identifier in file
  integer(HID_T) :: memspace      ! Dataspace identifier in mem

  integer(HSIZE_T) :: count(6), offset(6), offsetm(6)

  real(DP), allocatable :: data(:,:,:,:,:,:)

  PUSH_SUB(write_matrix_ser_hdf)

  call h5fopen_f(trim(name), H5F_ACC_RDWR_F, file_id, error)

  rank=6
  count(1) = SCALARSIZE
  count(2) = 1
  count(3) = nmtx
  count(4) = nmtx
  count(5) = 1
  count(6) = 1

  offset(:) = 0
  offset(5) = is - 1
  offset(6) = iq - 1

  offsetm(:) = 0

  SAFE_ALLOCATE(data,(count(1),count(2),count(3),count(4),count(5),count(6)))

  do jj = 1, nmtx
  do ii = 1, nmtx
    data(1,1,ii,jj,1,1) = dble(matrix(ii,jj))
#ifdef CPLX
    data(2,1,ii,jj,1,1) = IMAG(matrix(ii,jj))
#endif
  enddo
  enddo

  call h5dopen_f(file_id, 'matrix', dset_id, error)
  call h5screate_simple_f(rank, count, memspace, error)
  call h5dget_space_f(dset_id,filespace,error)
  call h5sselect_hyperslab_f(memspace, H5S_SELECT_SET_F, offsetm, count, error)
  call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, offset, count, error)
  call h5dwrite_f(dset_id, H5T_NATIVE_DOUBLE, data, count, error, memspace, filespace)

  SAFE_DEALLOCATE(data)

  call h5dclose_f(dset_id, error)
  call h5sclose_f(memspace, error)
  call h5sclose_f(filespace, error)

  call h5fclose_f(file_id, error)

  POP_SUB(write_matrix_ser_hdf)

end subroutine write_matrix_ser_hdf

!========================================================================

subroutine write_matrix_f_ser_hdf(nfreq,retarded,ONLYIFCPLX(advanced)nmtx,iq,is,name)
  integer, intent(in) :: nfreq
  complex(DPC), intent(in) :: retarded(:,:,:) !< (nfreq,nmtx,nmtx)
#ifdef CPLX
  complex(DPC), intent(in) :: advanced(:,:,:) !< (nfreq,nmtx,nmtx)
#endif
  integer, intent(in) :: nmtx
  integer, intent(in) :: iq
  integer, intent(in) :: is
  character(len=*), intent(in) :: name

  integer :: ii, jj, error, rank
  real(DP), allocatable :: data(:,:,:,:,:,:)
  type(progress_info) :: prog_info !< a user-friendly progress report

  integer(HID_T) :: file_id       ! File identifier
  integer(HID_T) :: dset_id       ! Dataset identifier
  integer(HID_T) :: filespace     ! Dataspace identifier in file
  integer(HID_T) :: memspace      ! Dataspace identifier in mem

  integer(HSIZE_T) :: count(6), offset(6), offsetm(6)

  PUSH_SUB(write_matrix_f_ser_hdf)

! DVF: this routine was built off of write_matrix_f_hdf to do the serial
! writing of an hdf format matrix. This is needed for epsmat_old2hdf5.f90

  rank=6
  count(1) = 2
  count(2) = nfreq
  count(3) = nmtx
  count(4) = 1
  count(5) = SCALARSIZE
  count(6) = 1

  SAFE_ALLOCATE(data, (count(1),count(2),count(3),count(4),count(5),count(6)))
  call h5fopen_f(trim(name), H5F_ACC_RDWR_F, file_id, error)
  call h5dopen_f(file_id, 'matrix', dset_id, error)
  call h5screate_simple_f(rank, count, memspace, error)
  call h5dget_space_f(dset_id,filespace,error)

  call progress_init(prog_info, 'writing matrix', 'column', nmtx)
  do jj = 1, nmtx
    call progress_step(prog_info, jj)
    do ii = 1, nmtx
      data(1,:,ii,1,1,1)=dble(retarded(:,ii,jj))
      data(2,:,ii,1,1,1)=IMAG(retarded(:,ii,jj))
#ifdef CPLX
      data(1,:,ii,1,2,1)=dble(advanced(:,ii,jj))
      data(2,:,ii,1,2,1)=IMAG(advanced(:,ii,jj))
#endif
    enddo

    offsetm(:) = 0
    call h5sselect_hyperslab_f(memspace, H5S_SELECT_SET_F, offsetm, count, error)

    offset(1)=0
    offset(2)=0
    offset(3)=0
    offset(4)=jj-1
    offset(5)=SCALARSIZE*(is-1)
    offset(6)=iq-1

    call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, offset, count, error)
    call h5dwrite_f(dset_id, H5T_NATIVE_DOUBLE, data, count, error, memspace, filespace)
  enddo
  call progress_free(prog_info)

  SAFE_DEALLOCATE(data)
  call h5dclose_f(dset_id, error)
  call h5sclose_f(memspace, error)
  call h5sclose_f(filespace, error)
  call h5fclose_f(file_id, error)

  POP_SUB(write_matrix_f_ser_hdf)
  
  return

end subroutine write_matrix_f_ser_hdf

!========================================================================

subroutine write_matrix_d_hdf(scal,matrix,nmtx,iq,is,name)
  type(scalapack), intent(in) :: scal
  SCALAR, intent(in) :: matrix(:,:) !< (scal%npr,scal%npc)
  integer, intent(in) :: nmtx
  integer, intent(in) :: iq
  integer, intent(in) :: is
  character(len=*), intent(in) :: name
  
  integer :: ii, jj, error, size, rank
#ifdef USESCALAPACK
  real(DP), allocatable :: datatmp(:,:,:,:,:,:)
  integer :: irow, icol, irowm, icolm
  integer :: icurr
#endif
  real(DP), allocatable :: data(:,:,:,:,:,:)
  type(progress_info) :: prog_info !< a user-friendly progress report

  integer(HID_T) :: file_id       ! File identifier
  integer(HID_T) :: dset_id       ! Dataset identifier
  integer(HID_T) :: filespace     ! Dataspace identifier in file
  integer(HID_T) :: memspace      ! Dataspace identifier in mem
#ifdef USESCALAPACK
!  integer(HID_T) :: plist_id      ! Property list identifier for parallel IO
!                                 ! Not used yet...
#endif

  integer(HSIZE_T) :: count(6), offset(6), offsetm(6)

  PUSH_SUB(write_matrix_d_hdf)

#ifdef VERBOSE
  if(peinf%inode .eq. 0) then
    write(6,*) ' Writing matrix: ', nmtx
    write(6,*) ' '
  endif
#endif

! XXX: For now, we will still have only proc 0 write...
! We should changes this to parallel writes. But doing
! this effectively from the scalapack, block cyclic layout
! seems a bit tricky. So, ignoring for now...

  rank=6
  count(1) = SCALARSIZE
  count(2) = 1
  count(3) = nmtx
  count(4) = 1
  count(5) = 1
  count(6) = 1

  if (peinf%inode .eq. 0) then
    SAFE_ALLOCATE(data,(count(1),count(2),count(3),count(4),count(5),count(6)))
    call h5fopen_f(trim(name), H5F_ACC_RDWR_F, file_id, error)
    call h5dopen_f(file_id, 'matrix', dset_id, error)
    call h5screate_simple_f(rank, count, memspace, error)
    call h5dget_space_f(dset_id,filespace,error)
  endif

#ifdef USESCALAPACK
  SAFE_ALLOCATE(datatmp, (count(1),count(2),count(3),count(4),count(5),count(6)))
  icurr=0  
#endif  

  call progress_init(prog_info, 'writing matrix', 'column', nmtx)
  do jj = 1, nmtx

    call progress_step(prog_info, jj)
#ifdef USESCALAPACK

    if(peinf%inode.eq.0) call timacc(47,1)

    icol=MOD(INT(((jj-1)/scal%nbl)+TOL_SMALL),scal%npcol)
    datatmp=0d0
    if (icol .eq. scal%mypcol) then
      do ii = 1, nmtx
        irow=MOD(INT(((ii-1)/scal%nbl)+TOL_SMALL),scal%nprow)
        if (irow .eq. scal%myprow) then
          icurr=icurr+1
          icolm=INT((icurr-1)/scal%npr+TOL_SMALL)+1
          irowm=MOD((icurr-1),scal%npr)+1
          datatmp(1,1,ii,1,1,1)=dble(matrix(irowm,icolm))
#ifdef CPLX
          datatmp(2,1,ii,1,1,1)=IMAG(matrix(irowm,icolm))
#endif
        endif
      enddo
    endif
    if (peinf%inode .eq. 0) then
      data=0d0
    endif

! XXX This is a big waste of communication. Should be fixed when do
! parallel IO.

    size = nmtx * SCALARSIZE

    call MPI_REDUCE(datatmp,data,size,MPI_DOUBLE_PRECISION,MPI_SUM,0, &
      MPI_COMM_WORLD,mpierr)

    if(peinf%inode.eq.0) call timacc(47,2)

#else

    if (peinf%inode .eq. 0) then
      do ii = 1, nmtx
        data(1,1,ii,1,1,1) = dble(matrix(ii,jj))
#ifdef CPLX
        data(2,1,ii,1,1,1) = IMAG(matrix(ii,jj))
#endif
      enddo
    endif

#endif

    if(peinf%inode.eq.0) call timacc(48,1)

    if (peinf%inode .eq. 0) then

      offsetm(:) = 0
      call h5sselect_hyperslab_f(memspace, H5S_SELECT_SET_F, offsetm, count, error)

      offset(1)=0
      offset(2)=0
      offset(3)=0
      offset(4)=jj-1
      offset(5)=is-1
      offset(6)=iq-1

      call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, offset, count, error)

      call h5dwrite_f(dset_id, H5T_NATIVE_DOUBLE, data, count, error, memspace, filespace)

    endif

    if(peinf%inode.eq.0) call timacc(48,2)

#ifdef USESCALAPACK
    call MPI_barrier(MPI_COMM_WORLD,mpierr)
#endif    

  enddo
  call progress_free(prog_info)

#ifdef USESCALAPACK
  SAFE_DEALLOCATE(datatmp)
#endif  

  if (peinf%inode .eq. 0) then
    SAFE_DEALLOCATE(data)
    call h5dclose_f(dset_id, error)
    call h5sclose_f(memspace, error)
    call h5sclose_f(filespace, error)
    call h5fclose_f(file_id, error)
  endif

  POP_SUB(write_matrix_d_hdf)
  
  return
end subroutine write_matrix_d_hdf

!========================================================================

#ifdef USESCALAPACK

subroutine write_matrix_d_par_hdf(scal,matrix,nmtx,iq,is,name)
  type(scalapack), intent(in) :: scal
  SCALAR, intent(in) :: matrix(:,:) !< (scal%npr,scal%npc)
  integer, intent(in) :: nmtx
  integer, intent(in) :: iq
  integer, intent(in) :: is
  character(len=*), intent(in) :: name
  
  integer :: ii, jj, error, rank
  real(DP), allocatable :: data(:,:,:,:,:,:)

  integer(HID_T) :: file_id       ! File identifier
  integer(HID_T) :: dset_id       ! Dataset identifier
  integer(HID_T) :: filespace     ! Dataspace identifier in file
  integer(HID_T) :: memspace      ! Dataspace identifier in mem
  integer(HID_T) :: plist_id      ! Property list identifier for parallel IO

  integer(HSIZE_T) :: count(6), countm(6), offset(6), offsetm(6), stride(6), block(6)
  integer(HSIZE_T) :: countr(6), offsetr(6), strider(6), blockr(6)

  integer :: comm, info, rowremainder, colremainder

  PUSH_SUB(write_matrix_d_par_hdf)

  if(peinf%inode.eq.0) call timacc(47,1)

! JRD: We need a barrier here or else parallel file opening gets mixed up with
! peinf%inode 0 opening the file to write the diagonal (which is called first).
  call MPI_barrier(MPI_COMM_WORLD,mpierr)

  comm = MPI_COMM_WORLD
  info = MPI_INFO_NULL

#ifdef VERBOSE
  if(peinf%inode .eq. 0) then
    write(6,*) ' Writing matrix: ', nmtx
    write(6,*) ' '
  endif
#endif

! JRD Should be ok with npr and npc = 0

  !if (scal%npr .eq. 0 .or. scal%npc .eq. 0) then
  !  write(6,*) peinf%inode,"Zero npr or npc!!", scal%npr, scal%npc
  !endif

  rank=6
  countm(1) = SCALARSIZE
  countm(2) = 1
  countm(3) = scal%npr
  countm(4) = scal%npc
  !countm(3) = scal%npr - mod(scal%npr,scal%nbl)
  !countm(4) = scal%npc - mod(scal%npc,scal%nbl)
  countm(5) = 1
  countm(6) = 1

  offsetm(:) = 0

  count(1) = 1
  count(2) = 1
  count(3) = scal%npr/scal%nbl
  count(4) = scal%npc/scal%nbl
  count(5) = 1
  count(6) = 1

  block(1) = SCALARSIZE
  block(2) = 1
  block(3) = scal%nbl
  block(4) = scal%nbl
  block(5) = 1
  block(6) = 1

  offset(1) = 0
  offset(2) = 0
  offset(3) = scal%myprow*scal%nbl
  offset(4) = scal%mypcol*scal%nbl
  offset(5) = is-1
  offset(6) = iq-1

  stride(1) = 1
  stride(2) = 1
  stride(3) = scal%nprow*scal%nbl
  stride(4) = scal%npcol*scal%nbl
  stride(5) = 1
  stride(6) = 1

  call h5pcreate_f(H5P_FILE_ACCESS_F, plist_id, error)
  call h5pset_fapl_mpio_f(plist_id, comm, info, error)
  call h5fopen_f(trim(name), H5F_ACC_RDWR_F, file_id, error, access_prp = plist_id)
  call h5pclose_f(plist_id,error)

  SAFE_ALLOCATE(data,(countm(1),countm(2),countm(3),countm(4),countm(5),countm(6)))
!XXX create data can we avoid duplication?
  do jj = 1, scal%npc
  !do jj = 1, scal%npc - mod(scal%npc,scal%nbl)
    do ii = 1, scal%npr
    !do ii = 1, scal%npr - mod(scal%npr,scal%nbl)
        data(1,1,ii,jj,1,1) = dble(matrix(ii,jj))
#ifdef CPLX
        data(2,1,ii,jj,1,1) = IMAG(matrix(ii,jj))
#endif
    enddo
  enddo

  call h5screate_simple_f(rank, countm, memspace, error)
  call h5sselect_hyperslab_f(memspace, H5S_SELECT_SET_F, offsetm, countm, error)

  call h5dopen_f(file_id, 'matrix', dset_id, error)
  call h5dget_space_f(dset_id,filespace,error)

  call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, offset, count, error, stride, block)

! Add in remainders, in case scal%nbl doesnt perfectly divide nmtx

! Bottom Rows
  rowremainder = mod(scal%npr,scal%nbl)
  if (rowremainder .ne. 0) then
    offsetr=offset
    countr=count
    blockr=block
    strider=stride
    offsetr(3)=nmtx-rowremainder
    countr(3)=rowremainder
    blockr(3)=1
    strider(3)=1
    call h5sselect_hyperslab_f(filespace, H5S_SELECT_OR_F, offsetr, countr, error, strider, blockr)
    !write(6,*) peinf%inode, "I have the bottom row", rowremainder, scal%npc
  endif

! Right Columns
  colremainder = mod(scal%npc,scal%nbl)
  if (colremainder .ne. 0) then
    offsetr=offset
    countr=count
    blockr=block
    strider=stride
    offsetr(4)=nmtx-colremainder
    countr(4)=colremainder
    blockr(4)=1
    strider(4)=1
    call h5sselect_hyperslab_f(filespace, H5S_SELECT_OR_F, offsetr, countr, error, strider, blockr)
    !write(6,*) peinf%inode, "I have the right column", colremainder, scal%npr
! Bottom Corner of Matrix
    if (rowremainder .ne. 0) then
      offsetr=offset
      countr=count
      blockr=block
      strider=stride
      offsetr(3)=nmtx-rowremainder
      countr(3)=rowremainder
      blockr(3)=1
      strider(3)=1
      offsetr(4)=nmtx-colremainder
      countr(4)=colremainder
      blockr(4)=1
      strider(4)=1
      call h5sselect_hyperslab_f(filespace, H5S_SELECT_OR_F, offsetr, countr, error, strider, blockr)
      !write(6,*) peinf%inode, "I have bottom both" 
    endif
  endif

  call h5pcreate_f(H5P_DATASET_XFER_F, plist_id, error)
  !call h5pset_dxpl_mpio_f(plist_id, H5FD_MPIO_INDEPENDENT_F, error)
  call h5pset_dxpl_mpio_f(plist_id, H5FD_MPIO_COLLECTIVE_F, error)
  if(peinf%inode.eq.0) call timacc(47,2)
  if(peinf%inode.eq.0) call timacc(48,1)
  call h5dwrite_f(dset_id, H5T_NATIVE_DOUBLE, data, countm, error, memspace, filespace, &
                      xfer_prp = plist_id)
  if(peinf%inode.eq.0) call timacc(48,2)
  call h5pclose_f(plist_id, error)

  SAFE_DEALLOCATE(data)
  call h5dclose_f(dset_id, error)
  call h5sclose_f(memspace, error)
  call h5sclose_f(filespace, error)
  call h5fclose_f(file_id, error)

  POP_SUB(write_matrix_d_par_hdf)
  
  return
end subroutine write_matrix_d_par_hdf

subroutine write_matrix_f_par_hdf(scal,nfreq,retarded,ONLYIFCPLX(advanced)nmtx,iq,is,name)
  type(scalapack), intent(in) :: scal
  integer, intent(in) :: nfreq
  complex(DPC), intent(in) :: retarded(:,:,:) !< (nfreq,scal%npr,scal%npc)
#ifdef CPLX
  complex(DPC), intent(in) :: advanced(:,:,:) !< (nfreq,scal%npr,scal%npc)
#endif
  integer, intent(in) :: nmtx
  integer, intent(in) :: iq
  integer, intent(in) :: is
  character(len=*), intent(in) :: name

  integer :: ii, jj, error, rank
  real(DP), allocatable :: data(:,:,:,:,:,:)

  integer(HID_T) :: file_id       ! File identifier
  integer(HID_T) :: dset_id       ! Dataset identifier
  integer(HID_T) :: filespace     ! Dataspace identifier in file
  integer(HID_T) :: memspace      ! Dataspace identifier in mem
  integer(HID_T) :: plist_id      ! Property list identifier for parallel IO

  integer(HSIZE_T) :: count(6), countm(6), offset(6), offsetm(6), stride(6), block(6)
  integer(HSIZE_T) :: countr(6), offsetr(6), strider(6), blockr(6)

  integer :: comm, info, rowremainder, colremainder

  PUSH_SUB(write_matrix_f_par_hdf)

  if(peinf%inode.eq.0) call timacc(47,1)

! JRD: We need a barrier here or else parallel file opening gets mixed up with
! peinf%inode 0 opening the file to write the diagonal (which is called first).
  call MPI_barrier(MPI_COMM_WORLD,mpierr)

  comm = MPI_COMM_WORLD
  info = MPI_INFO_NULL

#ifdef VERBOSE
  if(peinf%inode .eq. 0) then
    write(6,*) ' Writing matrix: ', nmtx
    write(6,*) ' '
  endif
#endif

! JRD Should be ok with npr and npc = 0

  rank=6
  countm(1) = 2
  countm(2) = nfreq
  countm(3) = scal%npr
  countm(4) = scal%npc
  countm(5) = SCALARSIZE
  countm(6) = 1

  offsetm(:) = 0

  count(1) = 1
  count(2) = 1
  count(3) = scal%npr/scal%nbl
  count(4) = scal%npc/scal%nbl
  count(5) = 1
  count(6) = 1

  block(1) = 2
  block(2) = nfreq
  block(3) = scal%nbl
  block(4) = scal%nbl
  block(5) = SCALARSIZE
  block(6) = 1

  offset(1) = 0
  offset(2) = 0
  offset(3) = scal%myprow*scal%nbl
  offset(4) = scal%mypcol*scal%nbl
  offset(5) = SCALARSIZE*(is-1)
  offset(6) = iq-1

  stride(1) = 1
  stride(2) = 1
  stride(3) = scal%nprow*scal%nbl
  stride(4) = scal%npcol*scal%nbl
  stride(5) = 1
  stride(6) = 1

  call h5pcreate_f(H5P_FILE_ACCESS_F, plist_id, error)
  call h5pset_fapl_mpio_f(plist_id, comm, info, error)
  call h5fopen_f(trim(name), H5F_ACC_RDWR_F, file_id, error, access_prp = plist_id)
  call h5pclose_f(plist_id,error)

  SAFE_ALLOCATE(data,(countm(1),countm(2),countm(3),countm(4),countm(5),countm(6)))
!XXX create data can we avoid duplication?
!XXX THREAD?
  do jj = 1, scal%npc
    do ii = 1, scal%npr
        data(1,:,ii,jj,1,1) = dble(retarded(:,ii,jj))
        data(2,:,ii,jj,1,1) = IMAG(retarded(:,ii,jj))
#ifdef CPLX
        data(1,:,ii,jj,2,1) = dble(advanced(:,ii,jj))
        data(2,:,ii,jj,2,1) = IMAG(advanced(:,ii,jj))
#endif
    enddo
  enddo

  call h5screate_simple_f(rank, countm, memspace, error)
  call h5sselect_hyperslab_f(memspace, H5S_SELECT_SET_F, offsetm, countm, error)

  call h5dopen_f(file_id, 'matrix', dset_id, error)
  call h5dget_space_f(dset_id,filespace,error)

  call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, offset, count, error, stride, block)

! Add in remainders, in case scal%nbl doesnt perfectly divide nmtx

! Bottom Rows
  rowremainder = mod(scal%npr,scal%nbl)
  if (rowremainder .ne. 0) then
    offsetr=offset
    countr=count
    blockr=block
    strider=stride
    offsetr(3)=nmtx-rowremainder
    countr(3)=rowremainder
    blockr(3)=1
    strider(3)=1
    call h5sselect_hyperslab_f(filespace, H5S_SELECT_OR_F, offsetr, countr, error, strider, blockr)
    !write(6,*) peinf%inode, "I have the bottom row", rowremainder, scal%npc
  endif

! Right Columns
  colremainder = mod(scal%npc,scal%nbl)
  if (colremainder .ne. 0) then
    offsetr=offset
    countr=count
    blockr=block
    strider=stride
    offsetr(4)=nmtx-colremainder
    countr(4)=colremainder
    blockr(4)=1
    strider(4)=1
    call h5sselect_hyperslab_f(filespace, H5S_SELECT_OR_F, offsetr, countr, error, strider, blockr)
    !write(6,*) peinf%inode, "I have the right column", colremainder, scal%npr
! Bottom Corner of Matrix
    if (rowremainder .ne. 0) then
      offsetr=offset
      countr=count
      blockr=block
      strider=stride
      offsetr(3)=nmtx-rowremainder
      countr(3)=rowremainder
      blockr(3)=1
      strider(3)=1
      offsetr(4)=nmtx-colremainder
      countr(4)=colremainder
      blockr(4)=1
      strider(4)=1
      call h5sselect_hyperslab_f(filespace, H5S_SELECT_OR_F, offsetr, countr, error, strider, blockr)
      !write(6,*) peinf%inode, "I have bottom both" 
    endif
  endif

  call h5pcreate_f(H5P_DATASET_XFER_F, plist_id, error)
  !call h5pset_dxpl_mpio_f(plist_id, H5FD_MPIO_INDEPENDENT_F, error)
  call h5pset_dxpl_mpio_f(plist_id, H5FD_MPIO_COLLECTIVE_F, error)
  if(peinf%inode.eq.0) call timacc(47,2)
  if(peinf%inode.eq.0) call timacc(48,1)
  call h5dwrite_f(dset_id, H5T_NATIVE_DOUBLE, data, countm, error, memspace, filespace, &
                      xfer_prp = plist_id)
  if(peinf%inode.eq.0) call timacc(48,2)
  call h5pclose_f(plist_id, error)

  SAFE_DEALLOCATE(data)
  call h5dclose_f(dset_id, error)
  call h5sclose_f(memspace, error)
  call h5sclose_f(filespace, error)
  call h5fclose_f(file_id, error)

  POP_SUB(write_matrix_f_par_hdf)
  
  return

end subroutine write_matrix_f_par_hdf

#endif


!==========================================================================================

subroutine write_matrix_f_hdf(scal,nfreq,retarded,ONLYIFCPLX(advanced)nmtx,iq,is,name)
  type(scalapack), intent(in) :: scal
  integer, intent(in) :: nfreq
  complex(DPC), intent(in) :: retarded(:,:,:) !< (nfreq,scal%npr,scal%npc)
#ifdef CPLX
  complex(DPC), intent(in) :: advanced(:,:,:) !< (nfreq,scal%npr,scal%npc)
#endif
  integer, intent(in) :: nmtx
  integer, intent(in) :: iq
  integer, intent(in) :: is
  character(len=*), intent(in) :: name

  integer :: ii, jj, error, size, rank
#ifdef USESCALAPACK
  real(DP), allocatable :: datatmp(:,:,:,:,:,:)
  integer :: irow, icol, irowm, icolm
  integer :: icurr
#endif
  real(DP), allocatable :: data(:,:,:,:,:,:)
  type(progress_info) :: prog_info !< a user-friendly progress report

  integer(HID_T) :: file_id       ! File identifier
  integer(HID_T) :: dset_id       ! Dataset identifier
  integer(HID_T) :: filespace     ! Dataspace identifier in file
  integer(HID_T) :: memspace      ! Dataspace identifier in mem

  integer(HSIZE_T) :: count(6), offset(6), offsetm(6)

  PUSH_SUB(write_matrix_f_hdf)

#ifdef VERBOSE
  if(peinf%inode .eq. 0) then
    write(6,*) ' Writing matrix: ', nmtx, nfreq
    write(6,*) ' '
  endif
#endif

! XXX: For now, we will still have only proc 0 write...
! We should changes this to parallel writes. But doing
! this effectively from the scalapack, block cyclic layout
! seems a bit tricky. So, ignoring for now...

  rank=6
  count(1) = 2
  count(2) = nfreq
  count(3) = nmtx
  count(4) = 1
  count(5) = SCALARSIZE
  count(6) = 1

  if (peinf%inode .eq. 0) then
    SAFE_ALLOCATE(data, (count(1),count(2),count(3),count(4),count(5),count(6)))
    call h5fopen_f(trim(name), H5F_ACC_RDWR_F, file_id, error)
    call h5dopen_f(file_id, 'matrix', dset_id, error)
    call h5screate_simple_f(rank, count, memspace, error)
    call h5dget_space_f(dset_id,filespace,error)
  endif

#ifdef USESCALAPACK
  SAFE_ALLOCATE(datatmp, (count(1),count(2),count(3),count(4),count(5),count(6)))
  icurr=0
#endif  

  call progress_init(prog_info, 'writing matrix', 'column', nmtx)
  do jj = 1, nmtx
    call progress_step(prog_info, jj)

#ifdef USESCALAPACK

!    if(peinf%inode.eq.0) call timacc(51,1)

    icol=MOD(INT(((jj-1)/scal%nbl)+TOL_SMALL),scal%npcol)
    datatmp=0d0
    if (icol .eq. scal%mypcol) then
      do ii = 1, nmtx
        irow=MOD(INT(((ii-1)/scal%nbl)+TOL_SMALL),scal%nprow)
        if (irow .eq. scal%myprow) then
          icurr=icurr+1
          icolm=INT((icurr-1)/scal%npr+TOL_SMALL)+1
          irowm=MOD((icurr-1),scal%npr)+1
          datatmp(1,:,ii,1,1,1)=dble(retarded(:,irowm,icolm))
          datatmp(2,:,ii,1,1,1)=IMAG(retarded(:,irowm,icolm))
#ifdef CPLX
          datatmp(1,:,ii,1,2,1)=dble(advanced(:,irowm,icolm))
          datatmp(2,:,ii,1,2,1)=IMAG(advanced(:,irowm,icolm))
#endif
        endif
      enddo
    endif
    if (peinf%inode .eq. 0) then
      data=0d0
    endif
! XXX This is a big waste of communication. Should be fixed when do
! parallel IO.

    size = nmtx*nfreq*2*SCALARSIZE

!    if(peinf%inode.eq.0) call timacc(51,2)
!    if(peinf%inode.eq.0) call timacc(52,1)

    call MPI_REDUCE(datatmp,data,size,MPI_DOUBLE_PRECISION,MPI_SUM,0, &
      MPI_COMM_WORLD,mpierr)

!    if(peinf%inode.eq.0) call timacc(52,2)

#else

    if (peinf%inode .eq. 0) then
      do ii = 1, nmtx
          data(1,:,ii,1,1,1)=dble(retarded(:,ii,jj))
          data(2,:,ii,1,1,1)=IMAG(retarded(:,ii,jj))
#ifdef CPLX
          data(1,:,ii,1,2,1)=dble(advanced(:,ii,jj))
          data(2,:,ii,1,2,1)=IMAG(advanced(:,ii,jj))
#endif
      enddo
    endif

#endif

!    if(peinf%inode.eq.0) call timacc(53,1)

    if (peinf%inode .eq. 0) then

      offsetm(:) = 0
      call h5sselect_hyperslab_f(memspace, H5S_SELECT_SET_F, offsetm, count, error)

      offset(1)=0
      offset(2)=0
      offset(3)=0
      offset(4)=jj-1
      offset(5)=SCALARSIZE*(is-1)
      offset(6)=iq-1

      call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, offset, count, error)

      call h5dwrite_f(dset_id, H5T_NATIVE_DOUBLE, data, count, error, memspace, filespace)

    endif

!    if(peinf%inode.eq.0) call timacc(53,2)
!    if(peinf%inode.eq.0) call timacc(54,1)

#ifdef USESCALAPACK
    call MPI_barrier(MPI_COMM_WORLD,mpierr)
#endif    

!    if(peinf%inode.eq.0) call timacc(54,2)


  enddo
  call progress_free(prog_info)

#ifdef USESCALAPACK
  SAFE_DEALLOCATE(datatmp)
#endif  

  if (peinf%inode .eq. 0) then
    SAFE_DEALLOCATE(data)
  endif

  if (peinf%inode .eq. 0) then
    call h5dclose_f(dset_id, error)
    call h5sclose_f(memspace, error)
    call h5sclose_f(filespace, error)
    call h5fclose_f(file_id, error)
  endif  

  POP_SUB(write_matrix_f_hdf)
  
  return
end subroutine write_matrix_f_hdf

!===================================================================================

subroutine write_gvec_indices_hdf(ng,isrtx,isorti,ekin,iq,name)
  integer, intent(in) :: ng
  integer, intent(in) :: isrtx(:) !< (ng)
  integer, intent(in) :: isorti(:) !< (ng)
  real(DP), intent(in) :: ekin(:) !< (ng)
  integer, intent(in) :: iq
  character(len=*), intent(in) :: name

  integer(HID_T) :: file_id       ! File identifier
  integer(HID_T) :: dset_id       ! Dataset identifier
  integer(HID_T) :: filespace     ! Dataspace identifier in file
  integer(HID_T) :: memspace      ! Dataspace identifier in mem

  integer(HSIZE_T) :: adims(3), edims(2), aoffset(3), eoffset(2), aoffsetm(3), eoffsetm(2)

  integer :: error, rank
  integer, allocatable :: sortarrays(:,:,:)
  real(DP), allocatable :: ekinarray(:,:)

  PUSH_SUB(write_gvec_indices_hdf)

  call h5fopen_f(trim(name), H5F_ACC_RDWR_F, file_id, error)

  SAFE_ALLOCATE(sortarrays,(ng,2,1))

! Write Arrays

  rank = 3
  adims(1) = ng
  adims(2) = 2
  adims(3) = 1
  aoffset(1) = 0
  aoffset(2) = 0
  aoffset(3) = iq - 1
  aoffsetm(:) = 0
  sortarrays(1:ng,1,1) = isrtx(1:ng)
  sortarrays(1:ng,2,1) = isorti(1:ng)
  call h5dopen_f(file_id, 'q-gvec-index', dset_id, error)
  call h5screate_simple_f(rank, adims, memspace, error)
  call h5dget_space_f(dset_id,filespace,error)
  call h5sselect_hyperslab_f(memspace, H5S_SELECT_SET_F, aoffsetm, adims, error)
  call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, aoffset, adims, error)
  call h5dwrite_f(dset_id, H5T_NATIVE_INTEGER, sortarrays, adims, error, memspace, filespace)
  call h5dclose_f(dset_id, error)
  call h5sclose_f(memspace, error)
  call h5sclose_f(filespace, error)

  SAFE_DEALLOCATE(sortarrays)
  SAFE_ALLOCATE(ekinarray,(ng,1))

  rank = 2
  edims(1) = ng
  edims(2) = 1
  eoffset(1) = 0
  eoffset(2) = iq - 1
  eoffsetm(:) = 0
  ekinarray(1:ng,1) = ekin(1:ng)
  call h5dopen_f(file_id, 'q-gvec-ekin', dset_id, error)
  call h5screate_simple_f(rank, edims, memspace, error)
  call h5dget_space_f(dset_id,filespace,error)
  call h5sselect_hyperslab_f(memspace, H5S_SELECT_SET_F, eoffsetm, edims, error)
  call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, eoffset, edims, error)
  call h5dwrite_f(dset_id, H5T_NATIVE_DOUBLE, ekinarray, edims, error, memspace, filespace)
  call h5dclose_f(dset_id, error)
  call h5sclose_f(memspace, error)
  call h5sclose_f(filespace, error)

  SAFE_DEALLOCATE(ekinarray)

  call h5fclose_f(file_id, error)

  POP_SUB(write_gvec_indices_hdf)

end subroutine write_gvec_indices_hdf

#endif

end module write_matrix_m

