! cgwf_htor.F90 (source file from the ABINIT package)

!{\src2tex{textfont=tt}}
!!****f* ABINIT/cgwf_htor
!! NAME
!! cgwf_htor
!!
!! FUNCTION
!! Update all wavefunctions |C>, non self-consistently.
!! also compute the corresponding H|C> and Vnl|C> (and S|C> if paw).
!! Uses a conjugate-gradient algorithm.
!! In case of paw, resolves a generalized eigenproblem using an
!!  overlap matrix (not used for norm conserving psps).
!!
!! COPYRIGHT
!! Copyright (C) 1998-2007 ABINIT group (DCA, XG, GMR, MT)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  berryopt == 4: electric field is on; berryopt /= 4: electric field is off
!!  chkexit= if non-zero, check whether the user wishes to exit
!!  cpus = CPU time limit
!!  dimffnl=second dimension of ffnl (1+number of derivatives)
!!  ffnl(npw,dimffnl,lmnmax,ntypat)=nonlocal form factors on basis sphere.
!!  filnam_ds1=name of input file (used for exit checking)
!!  filstat=name of the status file
!!  gs_hamk <type(gs_hamiltonian_type)>=all data for the Hamiltonian at k
!!  icg=shift to be applied on the location of data in the array cg
!!  ikg=shift to be given to the location of the data in the arrays kg
!!      and pwind
!!  ikpt=number of the k-point
!!  inonsc=index of non self-consistent loop
!!  isppol=spin polarization currently treated
!!  kg_k(3,npw)=coordinates of planewaves in basis sphere.
!!  kinpw(npw)=(modified) kinetic energy for each plane wave (Hartree)
!!  lmnmax=if useylm=1, max number of (l,m,n) comp. over all type of psps
!!        =if useylm=0, max number of (l,n)   comp. over all type of psps
!!  matblk=dimension of the array ph3d
!!  mband =maximum number of bands
!!  mcg=second dimension of the cg array
!!  mcgq=second dimension of the cgq array
!!  mgfft=maximum size of 1D FFTs
!!  mkgq = second dimension of pwnsfacq
!!  mkmem =number of k points which can fit in memory; set to 0 if use disk
!!  mpi_enreg=informations about MPI parallelization
!!  mpsang= 1+maximum angular momentum for nonlocal pseudopotentials
!!  mpssoang= 1+maximum (spin*angular momentum) for nonlocal pseudopotentials
!!  mpw=maximum dimensioned size of npw
!!  natom=number of atoms in cell.
!!  nband=number of bands.
!!  nbdblock=number of bands in a block
!!  nkpt=number of k points
!!  nline=number of line minimizations per band.
!!  npw=number of planewaves in basis sphere at given k.
!!  npwarr(nkpt)=number of planewaves in basis at this k point
!!  nspinor=number of spinorial components of the wavefunctions
!!  nsppol=1 for unpolarized, 2 for spin-polarized
!!  ntypat=number of types of atoms in cell.
!!  nvloc=final dimension of vlocal (usually 1, but 4 for non-collinear)
!!  n4,n5,n6 used for dimensioning of vlocal
!!  ortalg=governs the choice of the algorithm for orthogonalisation.
!!  ph3d(2,npw,matblk)=3-dim structure factors, for each atom and plane wave.
!!  prtvol=control print volume and debugging output
!!  pwind(pwind_alloc,2,3) = array used to compute
!!           the overlap matrix smat between k-points (see initberry.f)
!!  pwind_alloc = first dimension of pwind
!!  pwnsfac(2,pwind_alloc) = phase factors for non-symmorphic translations
!!                           (see initberry.f)
!!  pwnsfacq(2,mkgq) = phase factors for the nearest neighbours of the
!!                     current k-point (electric field, MPI //)
!!  tolwfr=tolerance on largest wf residual
!!  use_subovl=1 if the overlap matrix is not identity in WFs subspace
!!  vlocal(n4,n5,n6,nvloc)= local potential in real space, on the augmented fft grid
!!  wfoptalg=govern the choice of algorithm for wf optimisation
!!   (0, 1, 10 and 11 : in the present routine, usual CG algorithm ;
!!    2 and 3 : use shifted square Hamiltonian)
!!  wtk=weight associated with current k point.
!!  zshift(nband)=in case wfoptalg is 2 or 3, shift of the Hamiltonian
!!
!! OUTPUT
!!  dphase_k(3) = change in Zak phase for the current k-point in case berryopt = 4 (electric field)
!!  resid(nband)=wf residual for new states=|(H-e)|C>|^2 (hartree^2)
!!  subham(nband*(nband+1))=Hamiltonian expressed in the WFs subspace
!!  subovl(nband*(nband+1)*use_subovl)=overlap matrix expressed in the WFs subspace
!!  if(gs_hamk%usepaw==1)
!!   gsc(2,mgsc)=<G|S|C band,k> coefficients for ALL bands
!!               where S is the overlap matrix (used only for paw)
!!   subvnl(nband*(nband+1)*(1-gs_hamk%usepaw))=non-local Hamiltonian expressed in the WFs subspace
!!
!! SIDE EFFECTS
!!  cg(2,mcg)
!!    at input =wavefunction <G|C band,k> coefficients for ALL bands
!!    at output same as input except that
!!      the current band, with number 'band' has been updated
!!  dtefield <type(efield_type)> = variables related to Berry phase
!!      calculations (see initberry.f)
!!  quit= if 1, proceeds to smooth ending of the job.
!!
!! NOTES
!!  cg should not be filtered and normalized : it should already
!!   be OK at input !
!!  Not sure that the generalized eigenproblem (when gs_hamk%usepaw=1)
!!   is compatible with wfoptalg=2 or 3 (use of shifted square
!!   Hamiltonian) - to be verified
!!
!! PARENTS
!!      vtowfk_htor
!!
!! CHILDREN
!!      indfftrisc,leave_new,mpi_allgather,mpi_allreduce,nonlop_htor,sg_fftrisc
!!      sg_fftrisc_htor,status,timab,wrtout,zaxpy,zcopy,zdscal
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

subroutine cgwf_htor(berryopt,cg,cgq,chkexit,cpus,dimffnl,dphase_k,dtefield,&
&                ffnl,filnam_ds1,filstat,&
&                gsc,gs_hamk,icg,igsc,ikg,ikpt,inonsc,&
&                 isppol,kg_k,kinpw,lmnmax,matblk,mband,&
&                mcg,mcgq,mgfft,mgsc,mkgq,mkmem,mpi_enreg,mpsang,&
&                mpssoang,mpw,natom,nband,nbdblock,nkpt,nline,npw,npwarr,nspinor,&
&                nsppol,ntypat,nvloc,n4,n5,n6,ortalg,&
&                ph3d,prtvol,pwind,pwind_alloc,pwnsfac,&
&                pwnsfacq,quit,resid,subham,subovl,subvnl,tolwfr,&
&                use_subovl,vlocal,wfoptalg,wtk,zshift)

use defs_basis
use defs_datatypes

#ifndef HAVE_FORTRAN_INTERFACES
use defs_xfuncmpi
#endif

!This section has been created automatically by the script Abilint (TD). Do not modify these by hand.
#ifdef HAVE_FORTRAN_INTERFACES
 use interfaces_01manage_mpi
 use interfaces_12ffts
 use interfaces_18seqpar, except_this_one => cgwf_htor
#endif
!End of the abilint section

 implicit none

#if defined MPI
 include 'mpif.h'
#endif

!Arguments ------------------------------------
integer,intent(in) :: berryopt,chkexit,dimffnl,icg,igsc,ikg,ikpt,inonsc,isppol,lmnmax,matblk
integer,intent(in) :: mband,mcg,mcgq,mgfft,mgsc,mkgq,mkmem,mpsang,mpssoang,mpw,n4
integer,intent(in) :: n5,n6,natom,nband,nbdblock,nkpt,nline,npw,nspinor,nsppol,ntypat
integer,intent(in) :: nvloc,ortalg,prtvol,pwind_alloc,use_subovl,wfoptalg
integer,intent(inout) :: quit
real(dp),intent(in) :: cpus,tolwfr,wtk
character(len=fnlen),intent(in) :: filnam_ds1,filstat
type(MPI_type),intent(inout) :: mpi_enreg
type(efield_type),intent(inout) :: dtefield
type(gs_hamiltonian_type),intent(in) :: gs_hamk
integer,intent(in) :: kg_k(3,npw),npwarr(nkpt),pwind(pwind_alloc,2,3)
real(dp),intent(in) :: cgq(2,mcgq),ffnl(npw,dimffnl,lmnmax,ntypat),kinpw(npw)
real(dp),intent(in) :: pwnsfac(2,pwind_alloc),pwnsfacq(2,mkgq),zshift(nband)
real(dp),intent(inout) :: cg(2,mcg)
real(dp),intent(inout) :: gsc(2,mgsc)
real(dp),intent(inout) :: ph3d(2,npw,matblk),vlocal(n4,n5,n6,nvloc)
real(dp),intent(inout) :: dphase_k(3)
real(dp),intent(out) :: subham(nband*(nband+1))
real(dp),intent(inout) :: subovl(nband*(nband+1)*use_subovl)
real(dp),intent(out) :: subvnl(nband*(nband+1)*(1-gs_hamk%usepaw))
real(dp),intent(out) :: resid(nband)

!Local variables-------------------------------
integer,parameter :: level=9,tim_getghc=1
integer,save :: nskip=0
integer :: choice,counter,ddkflag,iband,ibandmax
integer :: iblock,icg1,idir,idum1,idum2,ierr,iexit,ifor,igs,ii,ikgf
integer :: ikpt2,ikpt2f,ikptf,iline,index,ipw,ipw1,isign,ispinor,istwf_k,isubh,isubo,itrs,jband
integer :: job,lowband,mcg_q,nbdblock_eff,nblock,nnlout,npw_k2,old_paral_level,openexit
integer :: optekin,outofplace,paw_opt,print,shiftbd,signs,sij_opt,spaceComm
integer :: use_vnl,useoverlap,igspinor
real(dp) :: chc,costh,deltae,deold,dhc,dhd,diff,dotgg,dotgp,doti,dotr
real(dp) :: dphase_aux2,dvnlc,e0,e0_old,e1,e1_old,fac,factor,ghc2
real(dp) :: lam0,lamold,phase0,phase_min,root,sinth,swap,tan2th,theta,thetam
character(len=500) :: message
integer :: hel(2,3)
integer,allocatable :: pwind_k(:),sflag_k(:)
real(dp) :: bcut(2,3),dphase_aux1(3),dtm_k(2),enlout(1),phase_end(3)
real(dp) :: phase_init(3),tsec(2)
real(dp),allocatable :: cg1_k(:,:),cgq_k(:,:),conjgr(:,:),cwavef(:,:)
real(dp),allocatable :: detovc(:,:,:),detovd(:,:,:),direc(:,:),direc_tmp(:,:),g_dummy(:,:,:),g_dummy_block(:,:,:,:)
real(dp),allocatable :: gcc(:,:),gcc_block(:,:,:)
real(dp),allocatable :: gh_direc(:,:),gh_direcws(:,:),ghc(:,:),ghc_block(:,:,:),ghcws(:,:)
real(dp),allocatable :: grad_berry(:,:),grad_total(:,:),gs_direc(:,:)
real(dp),allocatable :: gscc(:,:),gscc_block(:,:,:),gsc_dummy(:,:)
real(dp),allocatable :: gvnlc(:,:),gvnlc_block(:,:,:),gvnl_direc(:,:),gvnl_dummy(:,:)
real(dp),allocatable :: pwnsfac_k(:,:),scprod(:,:),scwavef(:,:)
real(dp),allocatable :: smat_inv(:,:,:),smat_k(:,:,:),swork(:,:),vresid(:,:),work(:,:,:,:)
real(dp),allocatable :: sub_tmp(:)

!no_abirules
! htor
real(dp), external :: DZNRM2, DDOT
double complex, external :: ZDOTU, ZDOTC
double complex, allocatable:: zdotarr(:)
double complex :: zdotures, chccmplx, cvccmplx
real(dp) :: cg_im, cg_re, ar, ai, nonlop_dum(1,1), chcre, chcim, cvcre, cvcim, cgreipw, cgimipw
real(dp),allocatable :: kpg_dum(:,:), cgbuf(:,:)
integer :: iiband, nkpg, iwavef, ig, gmin, gmax, gnpw, gmpw, jpw, ngb, blkgb, igbmin, igbmax, jgb
integer :: iproc, igb
real(dp) :: ek0,ek0_inv,poly,xx,t0=0.0_dp ,t1=0.0_dp ,tstart=0.0_dp ,tend=0.0_dp
real(dp),save :: tfftp=0.0_dp, tffts=0.0_dp ! time for parallel and serial fft
integer,save :: fft=0 ! serial fft=1, parallel fft=0
integer,allocatable :: indpw_k(:,:), gdist(:), gnum(:), ngbproc(:), packin(:), packout(:,:), unpackin(:,:), unpackout(:)
integer :: maxgbin, maxgbout, iiproc, ipwmin, ipwmax, i3

! **********************************************************************

#ifdef __VMS
!DEC$ ATTRIBUTES ALIAS:'ZDOTC' :: zdotc
!DEC$ ATTRIBUTES ALIAS:'DDOT' :: ddot
!DEC$ ATTRIBUTES ALIAS:'ZAXPY' :: zaxpy
!DEC$ ATTRIBUTES ALIAS:'ZCOPY' :: zcopy
!DEC$ ATTRIBUTES ALIAS:'ZDSCAL' :: zdscal
#endif

!DEBUG
!write(6,*)' cgwf_htor : enter '
!ENDDEBUG

!======================================================================
!========= LOCAL VARIABLES DEFINITIONS AND ALLOCATIONS ================
!======================================================================

!Starting the routine
call timab(22,1,tsec)
if(prtvol<0) call status(0,filstat,iexit,level,'enter cgwf    ')


!Structured debugging if prtvol==-level
if(prtvol==-level)then
 write(message,'(80a,a,a)') ('=',ii=1,80),ch10,' cgwf : enter '
 call wrtout(06,message,'COLL')
end if

! Ops for no PAW
useoverlap=0
use_vnl=1

!Initializations and allocations
isubh=1;isubo=1
istwf_k=gs_hamk%istwf_k
print=0;if(prtvol==-level)print=1

#if defined MPI
gmin=mpi_enreg%gmin
gmax=mpi_enreg%gmax
gnpw=gmax-gmin+1
gmpw=mpi_enreg%mgblk
#else
gmin=1
gmax=npw
gnpw=gmax-gmin+1
gmpw=npw
#endif

optekin=0;

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Playing with index shuffling ...
allocate(indpw_k(4,npw))
allocate(gdist(npw))
allocate(gnum(mpi_enreg%gngroup))
allocate(ngbproc(mpi_enreg%gngroup))
call indfftrisc(gs_hamk%gbound(3:3+2*mgfft+4,1),indpw_k,kg_k,mgfft,ngb,gs_hamk%ngfft,npw)
blkgb=ngb/mpi_enreg%gngroup+1

!gdist(:)=-1
!gnum(:)=0
!do iproc=0,mpi_enreg%gngroup-1
! ! for each processor lower and upper bounds of igb
! igbmin=iproc*blkgb+1
! igbmax=min((iproc+1)*blkgb,ngb)
! if(mpi_enreg%me==0) write(*,*) 'iproc: ', iproc, 'ngb: ', igbmax-igbmin
! do igb=igbmin,igbmax ! for every line iproc needs
!  do ipw=1,npw ! every npw iiproc has
!   if(indpw_k(4,ipw)==igb) then ! if iproc needs it -> mark it
!    if(gdist(ipw)==-1) then
!     gdist(ipw) = iproc
!     gnum(iproc+1)=gnum(iproc+1)+1
!    else
!     write(*,*) 'BUG: ipw=', ipw,' is already associated with proc', gdist(ipw)
!     stop
!    end if
!   end if
!  end do
! end do
!end do
!! gdist(ipw) gives the processor where ipw is needed for FFT
!! gnum(iproc) gives the number of pws for iproc
!if(mpi_enreg%me==0) write(*,*) gnum(:)

!!!!!!!!!!!!!!!!!
! get the maxgbin, sendcounts and recvcounts for sg_fftrisc_htor :)
! the iiproc loop is to get the maximum of ngbproc across all processors!
! (this saves an ALLREDUCE :)
ngbproc(:)=0
maxgbin=0;
do iiproc=0,mpi_enreg%gngroup-1
 ipwmin=iiproc*mpi_enreg%mgblk+1
 ipwmax=min((iiproc+1)*mpi_enreg%mgblk,npw)
 do iproc=0,mpi_enreg%gngroup-1
  ! for each processor lower and upper bounds of igb
  igbmin=iproc*blkgb+1
  igbmax=min((iproc+1)*blkgb,ngb)
  do igb=igbmin,igbmax ! for every line iproc needs
   do ipw=ipwmin,ipwmax ! every npw iiproc has
    if(indpw_k(4,ipw)==igb) then ! if iproc needs it from iiproc -> count it
     ngbproc(iproc+1)=ngbproc(iproc+1)+1
    end if
   end do
  end do
  ! if iiproc sends to me (I have to receive from him): save it as rcount
!  if (iproc==mpi_enreg%gindex) rcounts(iiproc+1)=ngbproc(iproc+1)
 end do
 ! if this ngbproc belongs to (I send as iiproc): me save it as scount
! if (iiproc==mpi_enreg%gindex) scounts(:)=ngbproc(:)
 ! the maximum number of gb's over all iiproc - size for alltoall :)
 if (maxgbin<maxval(ngbproc)) maxgbin=maxval(ngbproc)
end do
! don't send (copy) to myself!
!rcounts(mpi_enreg%gindex+1)=0
!scounts(mpi_enreg%gindex+1)=0
!!!!!!!!!!!!

!!!!!!!!!!!!
! get the maxgbout, sendcounts and recvcounts for sg_fftrisc_htor :)
! get number of xy-lines for each processor
! this whole loop is just to get the datasize for the ALLTOALL
! it is saved in maxgb afterwards ...
ngbproc(:) = 0
maxgbout = 0
do iiproc=0,mpi_enreg%gngroup-1
 ! for each processor lower and upper bounds of igb
 igbmin=iiproc*blkgb+1
 igbmax=min((iiproc+1)*blkgb,ngb)
 do iproc=0,mpi_enreg%gngroup-1
  ipwmin=iproc*mpi_enreg%mgblk+1
  ipwmax=min((iproc+1)*mpi_enreg%mgblk,npw)
  do igb=igbmin,igbmax ! for every line iiproc has
   do ipw=ipwmin,ipwmax ! every npw iproc needs
    if(indpw_k(4,ipw)==igb) then ! if iproc needs it -> count it
     ngbproc(iproc+1)=ngbproc(iproc+1)+1
    end if
   end do
  end do
  ! if iiproc sends to me (I have to receive from him): save it as rcount
!  if (iproc==mpi_enreg%gindex) rcounts(iiproc+1)=ngbproc(iproc+1)
 end do
 ! if this ngbproc belongs to me (I send as iiproc):  save it as scount
! if (iiproc==mpi_enreg%gindex) scounts(:)=ngbproc(:)
 ! the maximum number of gb's over all iiproc - size for alltoall :)
 if (maxgbout<maxval(ngbproc)) maxgbout=maxval(ngbproc)
end do
! don't send (copy) to myself!
!rcounts(mpi_enreg%gindex+1)=0
!scounts(mpi_enreg%gindex+1)=0
!!!!!!!!!!

!!!!!!!! fill packin array :)
allocate(packin(gnpw))
packin(:)=0
! fill packin array
ipwmin=gmin
ipwmax=gmax
do iproc=0,mpi_enreg%gngroup-1
 ! for each processor lower and upper bounds of igb
 igbmin=iproc*blkgb+1
 igbmax=min((iproc+1)*blkgb,ngb)
 index=iproc*2*maxgbin+1 ! index of data in a2abuf of iproc
 do igb=igbmin,igbmax ! for every line that iproc needs
  do ipw=ipwmin,ipwmax ! for every ipw I have
   if(indpw_k(4,ipw)==igb) then ! if iproc needs it -> pack it
    ! the gmin..gmax are as 1..gmax-gmin+1 in the fofgin array (it's distributed :)
    jpw=ipw-mpi_enreg%gmin+1
    if(packin(jpw)/=0) then
      write(*,*) 'packin(',jpw,') is already associated with', packin(jpw)
      stop
    end if
    packin(jpw)=index;
    index=index+2
   end if
  end do
 end do
end do
!!!!!!!!!!!!!!!!!!

!!!!!!!! fill unpackin array :)
allocate(unpackin(blkgb,gs_hamk%ngfft(3)))
unpackin(:,:)=0
igbmin=mpi_enreg%gindex*blkgb+1
igbmax=min((mpi_enreg%gindex+1)*blkgb,ngb)
do iproc=0,mpi_enreg%gngroup-1
 ! for each processor lower and upper bounds of igb
 ipwmin=iproc*mpi_enreg%mgblk+1
 ipwmax=min((iproc+1)*mpi_enreg%mgblk,npw)
 index=iproc*2*maxgbin+1 ! index of data in a2abuf of iproc
 do igb=igbmin,igbmax ! for every line that I need
  do ipw=ipwmin,ipwmax ! for every ipw iproc had (and sent me)
   if(indpw_k(4,ipw)==igb) then ! if I need it -> unpack it
    jgb=igb-igbmin+1
    i3=indpw_k(3,ipw)
    if(unpackin(jgb,i3)/=0) then
      write(*,*) 'unpackin(',jgb,',',i3,') is already associated with', unpackin(jgb,i3)
      stop
    end if
    unpackin(jgb,i3)=index
    index=index+2
   end if
  end do
 end do
end do
!!!!!!!!!!!!!!!!!

!!!!!!!! fill packout array :)
allocate(packout(blkgb,gs_hamk%ngfft(3)))
packout(:,:)=0
igbmin=mpi_enreg%gindex*blkgb+1
igbmax=min((mpi_enreg%gindex+1)*blkgb,ngb)
do iproc=0,mpi_enreg%gngroup-1
 ! for each processor lower and upper bounds of ipw
 ipwmin=iproc*mpi_enreg%mgblk+1
 ipwmax=min((iproc+1)*mpi_enreg%mgblk,npw)
 index=iproc*2*maxgbout+1 ! index of data in a2abuf of iproc
 do igb=igbmin,igbmax ! for every line I have
  do ipw=ipwmin,ipwmax ! for every ipw iproc needs
   if(indpw_k(4,ipw)==igb) then ! if iproc needs it -> pack it
    jgb=igb-igbmin+1
    i3=indpw_k(3,ipw)
    if(packout(jgb,i3)/=0) then
      write(*,*) 'packout(',jgb,i3,') is already associated with', packout(jgb,i3)
      stop
    end if
    packout(jgb,i3)=index
    index=index+2
   end if
  end do
 end do
end do
!!!!!!!!!!!!!!!!!

!!!!!!!! fill unpackout array :)
allocate(unpackout(gnpw))
unpackout(:)=0
ipwmin=mpi_enreg%gmin
ipwmax=mpi_enreg%gmax
do iproc=0,mpi_enreg%gngroup-1
 ! for each processor lower and upper bounds of igb
 igbmin=iproc*blkgb+1
 igbmax=min((iproc+1)*blkgb,ngb)
 index=iproc*2*maxgbout+1 ! index of data in a2abuf of iproc
 do igb=igbmin,igbmax ! for every line that iproc has (and sent)
  do ipw=ipwmin,ipwmax ! for every ipw I need
   if(indpw_k(4,ipw)==igb) then ! if I need it -> unpack it
    jpw=ipw-ipwmin+1
    if(unpackout(jpw)/=0) then
      write(*,*) 'unpackout(',jpw,') is already associated with', unpackout(jpw)
      stop
    end if
    unpackout(jpw)=index
    index=index+2
   end if
  end do
 end do
end do
!!!!!!!!!!!!!!!!!



! now we should reorder the cg and the kinpw array
! and we would have to change nonlop *argh*
deallocate(indpw_k,gdist,gnum,ngbproc)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! /index shuffling

! TODO: some of these allocations are still to big ...
allocate(ghc(2,gmpw*mpi_enreg%gngroup),gvnlc(2,gmpw*mpi_enreg%gngroup))
allocate(conjgr(2,gmpw*mpi_enreg%gngroup))
allocate(cwavef(2,gmpw*mpi_enreg%gngroup))
allocate(direc(2,gmpw*mpi_enreg%gngroup),scprod(2,nband))
allocate(vresid(2,gmpw*mpi_enreg%gngroup))
allocate(gh_direc(2,gmpw*mpi_enreg%gngroup))
allocate(gvnl_direc(2,gmpw*mpi_enreg%gngroup))
allocate(zdotarr(2*nband))
allocate(cgbuf(2,gmpw*mpi_enreg%gngroup))

outofplace=0;

#if defined MPI
           tstart=MPI_Wtime()
#endif

!======================================================================
!====================== LOOP OVER BANDS ===============================
!======================================================================
 do iband=1,nband
  !======================================================================
  !========== INITIALISATION OF MINIMIZATION ITERATIONS =================
  !======================================================================

  ! Tell us what is going on:
  if(prtvol>=10)then
   write(message, '(a,i6,2x,a,i3,a,f5.2,f5.2)' ) ' --- optimizing band',iband,'for',nline,' lines, tfft ',tfftp,tffts
   call  wrtout(06,message,'COLL')
  end if

  dotgp=one

  ! BEGIN Parallel Part - all nodes have the same vectors and cg-Arrays at this
  ! point

  !DEBUG
  cwavef(:,:) = zero
  ! Extraction of the vector that is iteratively updated
  call ZCOPY(gnpw, cg(1,npw*(iband-1)+icg+gmin), 1, cwavef, 1)

  ! this may be not necessary
  ! call sqnorm_g(dotr,istwf_k,mpi_enreg,npw*nspinor,cwavef)
  ! we need only the real part
  ! normalize cwavef
  dotr = DDOT(2*gnpw, cwavef, 1, cwavef, 1)
#if defined MPI && defined MPI_EXT
  call MPI_ALLREDUCE(MPI_IN_PLACE, dotr, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
  &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif
  call ZDSCAL(gnpw, 1._dp/sqrt(abs(dotr)), cwavef, 1)

tfftp=0
  !Compute <g|H|c>
  if(tfftp >= tffts) then ! do serial fft
#if defined MPI
   t0=MPI_WTIME()
   ! gather current band to all processors (serial case)
   call MPI_ALLGATHER(cwavef, 2*gmpw, MPI_DOUBLE_PRECISION, cwavef, 2*gmpw, &
   &                  MPI_DOUBLE_PRECISION, mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif
   call sg_fftrisc(1,vlocal,cwavef,ghc,work,gs_hamk%gbound,gs_hamk%gbound,&
   & gs_hamk%istwf_k,kg_k,kg_k,mgfft,gs_hamk%ngfft,npw,npw,n4,n5,n6,2,1.0_dp)
#if defined MPI
   ! 'scatter' current band
   call ZCOPY(gnpw, cwavef(1,gmin), 1, cwavef, 1)
   call ZCOPY(gnpw, ghc(1,gmin), 1, ghc, 1)
   t1=MPI_WTIME()
#endif
   ! measurement of serial FFT only on the first band
   if(iband==1) then
    tffts=t1-t0
#if defined MPI && defined MPI_EXT
    call MPI_ALLREDUCE(MPI_IN_PLACE, tffts, 1, MPI_DOUBLE_PRECISION, MPI_MAX, &
    &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif
   end if
   fft=0
  else ! do parallel fft
#if defined MPI
           t0=MPI_WTIME()
#endif
   call sg_fftrisc_htor(1,vlocal,cwavef,ghc,work,gs_hamk%gbound,gs_hamk%gbound,&
&   gs_hamk%istwf_k,kg_k,kg_k,mgfft,gs_hamk%ngfft,npw,npw,n4,n5,n6,2,1.0_dp,&
&   mpi_enreg,maxgbin,maxgbout,gnpw,blkgb,packin,unpackin,packout,unpackout)
#if defined MPI
           t1=MPI_WTIME()
#endif
   ! measurement of parallel FFT only on the second band
   if(.true.) then !force measuring
!   if(iband==2) then
    tfftp=t1-t0
#if defined MPI && defined MPI_EXT
    call MPI_ALLREDUCE(MPI_IN_PLACE, tfftp, 1, MPI_DOUBLE_PRECISION, MPI_MAX, &
    &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif
   end if
   fft=1
  end if

  signs=2 ; choice=1 ; nnlout=1 ; idir=0 ; nkpg=0
  paw_opt=gs_hamk%usepaw
  call nonlop_htor(choice,gs_hamk%dimekb1,gs_hamk%dimekb2,dimffnl,dimffnl,&
  &   gs_hamk%ekb,enlout,ffnl,ffnl,gs_hamk%gmet,gs_hamk%gprimd,idir,&
  &   gs_hamk%indlmn,gs_hamk%istwf_k,kg_k,kg_k,kpg_dum,kpg_dum,gs_hamk%kpoint,gs_hamk%kpoint,&
  &   lmnmax,matblk,mgfft,mpi_enreg,mpsang,mpssoang,natom,gs_hamk%nattyp,&
  &   gs_hamk%ngfft,nkpg,nkpg,gs_hamk%nloalg,nnlout,npw,npw,nspinor,ntypat,&
  &   gs_hamk%phkxred,gs_hamk%phkxred,gs_hamk%ph1d,ph3d,ph3d,gs_hamk%pspso,&
  &   signs,gs_hamk%ucvol,cwavef,gvnlc)

  ! sum the potentials up
  do ipw=1,gnpw
   jpw=ipw+gmin-1
   if(kinpw(ipw)<huge(0.0_dp)*1.d-11)then
    ghc(1,ipw)=kinpw(jpw)*cwavef(1,ipw)+ghc(1,ipw)+gvnlc(1,ipw)
    ghc(2,ipw)=kinpw(jpw)*cwavef(2,ipw)+ghc(2,ipw)+gvnlc(2,ipw)
   else
    ghc(1,ipw)=0.0_dp
    ghc(2,ipw)=0.0_dp
   end if
  end do ! ipw

  !======================================================================
  !====== BEGIN LOOP FOR A GIVEN BAND: MINIMIZATION ITERATIONS ==========
  !======================================================================

  if(nline/=0)then
   do iline=1,nline
    !======================================================================
    !================= COMPUTE THE RESIDUAL ===============================
    !======================================================================

    ! Compute lambda = <C|H|C> or <C|(H-zshift)**2|C>
    ! call dotprod_g(chc,doti,istwf_k,mpi_enreg,npw*nspinor,1,cwavef,ghc)
    ! we need only the real part
    chc = DDOT(2*gnpw, cwavef, 1, ghc, 1)
#if defined MPI && defined MPI_EXT
    call MPI_ALLREDUCE(MPI_IN_PLACE, chc, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
    &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif
    lam0 = chc

    ! Check that lam0 is decreasing on succeeding lines:
    if (iline==1) then
     lamold=lam0
    else
     if (lam0>lamold+1.d-12) then
      write(message, '(a,a,a,i8,a,1p,e14.6,a1,3x,a,1p,e14.6,a1)') ' cgwf : WARNING -',ch10, '  New trial energy at line',&
      & iline,' = ',lam0,ch10, '  is higher than former:',lamold,ch10
      call wrtout(06,message,'COLL')
     end if
     lamold=lam0
    end if

    ! Compute residual vector:
    ! Note that vresid is precomputed to garantee cancellation of errors
    ! and allow residuals to reach values as small as 1.0d-24 or better.

    ! vresid = -1*chc*cwavef(:)+ghc(:)
    ! TODO this should be a new blas (y_ = a*x_ + z_)
    call ZCOPY(gnpw, ghc, 1, vresid, 1)
    call ZAXPY(gnpw, dcmplx(-1*chc), cwavef, 1, vresid, 1)

    ! Compute residual (squared) norm
    ! we need only the real part
    resid(iband) = DDOT(2*gnpw, vresid, 1, vresid, 1)
#if defined MPI && defined MPI_EXT
    call MPI_ALLREDUCE(MPI_IN_PLACE, resid(iband), 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
    &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif

    if(prtvol==-level)then
     write(message,'(a,a,i3,2f14.6)') ch10, 'cgwf : iline,eval,resid=',iline,chc,resid(iband)
     call wrtout(06,message,'COLL')
    end if

    !======================================================================
    !============== CHECK FOR CONVERGENCE CRITERIA ========================
    !======================================================================

    ! If residual sufficiently small stop line minimizations
    if (resid(iband)<tolwfr) then
     if(prtvol>=10)then
      write(message, '(a,i4,a,i2,a,es12.4)' ) ' cgwf: band',iband,' converged after '&
      & ,iline, ' line minimizations : resid =',resid(iband)
      call wrtout(06,message,'COLL')
     end if
     ! Number of two-way 3D ffts skipped
     nskip=nskip+(nline-iline+1)
     ! Exit from the loop on iline
     exit
    end if

    ! If user require exiting the job, stop line minimisations
    if (quit==1) then
     write(message, '(a,i4)' ) ' cgwf : user require exiting => skip update of band ',iband
     call wrtout(06,message,'COLL')
     ! Number of two-way 3D ffts skipped
     nskip=nskip+(nline-iline+1)
     ! Exit from the loop on iline
     exit
    end if

    !======================================================================
    !=========== COMPUTE THE STEEPEST DESCENT DIRECTION ===================
    !======================================================================

    ! Store <G|H|C> in direc
    call ZCOPY(gnpw, ghc, 1, direc, 1)

    !======================================================================
    !=========== PROJECT THE STEEPEST DESCENT DIRECTION ===================
    !========= OVER THE SUBSPACE ORTHOGONAL TO OTHER BANDS ================
    !======================================================================

    ! The following projection over the subspace orthogonal to occupied bands
    ! is optional. It is a bit more accurate, but doubles the number of N^3 ops.
    ! It is done only if ortalg>=0.

    !  Project the steepest descent direction:
    !  <direc|=<direc| - \sum_{i} <cg_i|direc>|cg_i>
    !  if all cg_i are orthogonal
    do iiband=1,nband
     index=npw*(iiband-1)+icg+gmin-1 ! index correction (G-parallelism)
     zdotarr(iiband) = ZDOTC(gnpw, cg(1,index+1), 1, direc, 1)
    end do
#if defined MPI && defined MPI_EXT
    call MPI_ALLREDUCE(MPI_IN_PLACE, zdotarr, 2*nband, MPI_DOUBLE_PRECISION, MPI_SUM, &
    &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif
    do iiband=1,nband
     index=npw*(iiband-1)+icg+gmin-1 ! index correction (G-parallelism)
     call ZAXPY(gnpw, -1*zdotarr(iiband), cg(1,index+1), 1, direc, 1)
    end do

    !======================================================================
    !======== PRECONDITION THE STEEPEST DESCENT DIRECTION =================
    !======================================================================

    if(prtvol<0) call status(iline,filstat,iexit,level,'call precon   ')
    !call precon(cwavef,zero,istwf_k,kinpw,mpi_enreg,npw,nspinor,optekin,pcon,direc)
    !Compute mean kinetic energy of band
    ek0=zero
    do ipw=1,gnpw
     jpw=ipw+gmin-1 ! index correction (G-parallelism)
     if(kinpw(ipw)<huge(0.0_dp)*1.d-11)then ! this is redundant (I guess)
      ek0=ek0+kinpw(jpw)*(cwavef(1,ipw)**2+cwavef(2,ipw)**2)
     end if
    end do
#if defined MPI && defined MPI_EXT
    call MPI_ALLREDUCE(MPI_IN_PLACE, ek0, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
    &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif

    if(ek0<1.0d-10)then
     write(message, '(a,a,a,a,a,a)' )ch10,&
     &  ' precon : WARNING -',ch10,&
     &  '  The mean kinetic energy of a wavefunction vanishes.',ch10,&
     &  '  It is reset to 0.1Ha.'
     call wrtout(6,message,'COLL')
     ek0=0.1_dp
    end if

    ek0_inv=1.0_dp/ek0

    do ipw=1,gnpw
     jpw=ipw+gmin-1 ! index correction (G-parallelism)
     if(kinpw(jpw)<huge(0.0_dp)*1.d-11)then
      xx=kinpw(jpw)*ek0_inv
      ! Teter polynomial ratio
      poly=27._dp+xx*(18._dp+xx*(12._dp+xx*8._dp))
      fac=poly/(poly+16._dp*xx**4)
      direc(1,ipw)=direc(1,ipw)*fac
      direc(2,ipw)=direc(2,ipw)*fac
     else
      direc(1,ipw)=zero
      direc(2,ipw)=zero
     end if
    end do
    !!!!! /precon

    !======================================================================
    !======= PROJECT THE PRECOND. STEEPEST DESCENT DIRECTION ==============
    !========= OVER THE SUBSPACE ORTHOGONAL TO OTHER BANDS ================
    !======================================================================
    if(prtvol<0) call status(0,filstat,iexit,level,'prjbd(2)      ')
    ! Projecting again out all bands (not normalized).
    ! htor: can be done as:
    !  <direc| = \sum_{i} <direc| - <cg_i|direc>|cg_i>
    !  if all cg_i are orthogonal
    ! TODO: implement this as a blas call?
    do iiband=1,nband
     index=npw*(iiband-1)+icg+gmin-1 ! index correction (G-parallelism)
     zdotarr(iiband) = ZDOTC(gnpw, cg(1,index+1), 1, direc, 1)
    end do
#if defined MPI && defined MPI_EXT
    call MPI_ALLREDUCE(MPI_IN_PLACE, zdotarr, 2*nband, MPI_DOUBLE_PRECISION, MPI_SUM, &
    &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif
    do iiband=1,nband
     ! Orthogonalize the steepest-descent direction against band iiband:
     ! direc <- direc - zdotarr(iiband)*|c_iiband>  (overlaps zdotarr were
     ! presumably accumulated just above this fragment -- TODO confirm)
     index=npw*(iiband-1)+icg+gmin-1 ! index correction (G-parallelism)
     call ZAXPY(gnpw, -1*zdotarr(iiband), cg(1,index+1), 1, direc, 1)
    end do

    !======================================================================
    !================= COMPUTE THE CONJUGATED GRADIENT ====================
    !======================================================================

    ! call dotprod_g(dotgg,doti,istwf_k,mpi_enreg,npw*nspinor,1,direc,ghc)
    ! we need only the real part
    ! dotgg = Re{<direc|ghc>}: DDOT over 2*gnpw reals equals the real part of
    ! the complex dot product; the imaginary part is not needed here.
    dotgg = DDOT(2*gnpw, direc, 1, ghc, 1)
#if defined MPI && defined MPI_EXT
    call MPI_ALLREDUCE(MPI_IN_PLACE, dotgg, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
    &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif

    ! At first iteration, dotgg/dotgp is zero
    ! Fletcher-Reeves-style update: conjgr <- direc + (dotgg/dotgp)*conjgr,
    ! implemented as an AXPY into direc followed by a copy back to conjgr.
    if (iline==1) then
     call ZCOPY(gnpw, direc, 1, conjgr, 1)
    else
     call ZAXPY(gnpw, dcmplx(dotgg/dotgp), conjgr, 1, direc, 1)
     call ZCOPY(gnpw, direc, 1, conjgr, 1)
    end if
    dotgp=dotgg

    !======================================================================
    !============ PROJECTION OF THE CONJUGATED GRADIENT ===================
    !======================================================================

    ! call dotprod_g(dotr,doti,istwf_k,mpi_enreg,npw*nspinor,3,cwavef,conjgr)
    ! TODO ZDOTC returns other values for doti ... but this does not matter?????
    ! zdotures = <conjgr|cwavef> (full complex overlap with the current band)
    zdotures = ZDOTC(gnpw, conjgr, 1, cwavef, 1)
#if defined MPI && defined MPI_EXT
    call MPI_ALLREDUCE(MPI_IN_PLACE, zdotures, 2, MPI_DOUBLE_PRECISION, MPI_SUM, &
    &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif

    ! Project the conjugated gradient onto the current band
    ! (i.e. remove the component along |cwavef| so direc stays orthogonal to it)
    ! TODO: new blas call y_ = a*x_ + z_
    call ZAXPY(gnpw, -1.0_dp*zdotures, cwavef, 1, conjgr, 1)
    call ZCOPY(gnpw, conjgr, 1, direc, 1)

    ! normalization of direction vector
    ! call sqnorm_g(dotr,istwf_k,mpi_enreg,npw*nspinor,direc)
    ! dotr = <direc|direc> (= real)
    ! we need only the real part
    dotr = DDOT(2*gnpw, direc, 1, direc, 1)
#if defined MPI && defined MPI_EXT
    call MPI_ALLREDUCE(MPI_IN_PLACE, dotr, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
    &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif
    ! normalize direc
    ! abs() is defensive: dotr is a squared norm and should already be >= 0.
    ! NOTE(review): no guard against dotr == 0 (fully projected-out direction)
    ! -- would divide by zero here; presumably cannot happen before convergence
    ! is detected below. TODO confirm.
    call ZDSCAL(gnpw, 1._dp/sqrt(abs(dotr)), direc, 1)

    !======================================================================
    !===== COMPUTE CONTRIBUTIONS TO 1ST AND 2ND DERIVATIVES OF ENERGY =====
    !======================================================================

    ! Compute gh_direc = <G|H|D>
    ! fft==0: legacy path -- gather the distributed direc to full length on
    ! every rank, run the serial FFT-based local-potential application, then
    ! copy this rank's slice (starting at gmin) back to the head of the arrays.
    if(fft==0) then
#if defined MPI
     ! NOTE(review): direc is passed as BOTH send and receive buffer; the MPI
     ! standard forbids buffer aliasing in MPI_ALLGATHER (MPI_IN_PLACE on the
     ! send side is the sanctioned form). This may work on some MPI
     ! implementations but is non-conforming -- verify.
     call MPI_ALLGATHER(direc, 2*gmpw, MPI_DOUBLE_PRECISION, direc, 2*gmpw, &
     &                  MPI_DOUBLE_PRECISION, mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif
     call sg_fftrisc(1,vlocal,direc,gh_direc,work,gs_hamk%gbound,gs_hamk%gbound,&
     &    gs_hamk%istwf_k,kg_k,kg_k,mgfft,gs_hamk%ngfft,npw,npw,n4,n5,n6,2,1.0_dp)
#if defined MPI
     call ZCOPY(gnpw, direc(1,gmin), 1, direc, 1)
     call ZCOPY(gnpw, gh_direc(1,gmin), 1, gh_direc, 1)
#endif
    else
     ! Distributed FFT path: sg_fftrisc_htor handles the pack/unpack and
     ! communication internally via the pack*/unpack* work arrays.
     call sg_fftrisc_htor(1,vlocal,direc,gh_direc,work,gs_hamk%gbound,gs_hamk%gbound,&
     &    gs_hamk%istwf_k,kg_k,kg_k,mgfft,gs_hamk%ngfft,npw,npw,n4,n5,n6,2,1.0_dp,mpi_enreg,&
&         maxgbin,maxgbout,gnpw,blkgb,packin,unpackin,packout,unpackout)
    end if

    ! Apply the nonlocal operator: gvnl_direc = Vnl|D>
    ! (choice=1, signs=2: apply operator, no derivatives)
    signs=2 ; choice=1 ; nnlout=1 ; idir=0 ; nkpg=0
    paw_opt=gs_hamk%usepaw
    call nonlop_htor(choice,gs_hamk%dimekb1,gs_hamk%dimekb2,dimffnl,dimffnl,&
    &   gs_hamk%ekb,enlout,ffnl,ffnl,gs_hamk%gmet,gs_hamk%gprimd,idir,&
    &   gs_hamk%indlmn,gs_hamk%istwf_k,kg_k,kg_k,kpg_dum,kpg_dum,gs_hamk%kpoint,gs_hamk%kpoint,&
    &   lmnmax,matblk,mgfft,mpi_enreg,mpsang,mpssoang,natom,gs_hamk%nattyp,&
    &   gs_hamk%ngfft,nkpg,nkpg,gs_hamk%nloalg,nnlout,npw,npw,nspinor,ntypat,&
    &   gs_hamk%phkxred,gs_hamk%phkxred,gs_hamk%ph1d,ph3d,ph3d,gs_hamk%pspso,&
    &   signs,gs_hamk%ucvol,direc,gvnl_direc)

    ! sum the potentials up
    ! gh_direc <- kinetic + local + nonlocal contributions; plane waves whose
    ! (modified) kinetic energy reaches the huge() cutoff sentinel are zeroed.
    do ipw=1,gnpw
     jpw=ipw+gmin-1
     ! NOTE(review): the cutoff test uses kinpw(ipw) but the applied value is
     ! kinpw(jpw) (jpw = ipw+gmin-1, the G-parallel global index). These
     ! indices differ whenever gmin > 1 -- the test likely should also use
     ! jpw. TODO confirm against the serial cgwf reference.
     if(kinpw(ipw)<huge(0.0_dp)*1.d-11)then
      gh_direc(1,ipw)=kinpw(jpw)*direc(1,ipw)+gh_direc(1,ipw)+gvnl_direc(1,ipw)
      gh_direc(2,ipw)=kinpw(jpw)*direc(2,ipw)+gh_direc(2,ipw)+gvnl_direc(2,ipw)
     else
      gh_direc(1,ipw)=0.0_dp
      gh_direc(2,ipw)=0.0_dp
     end if
    end do ! ipw

    if(prtvol<0) call status(iline,filstat,iexit,level,'after getghc  ')

    ! Compute dhc = Re{<D|H|C>}
    ! call dotprod_g(dhc,doti,istwf_k,mpi_enreg,npw*nspinor,1,direc,ghc)
    ! we need only the real part
    dhc = DDOT(2*gnpw, direc, 1, ghc, 1)
#if defined MPI && defined MPI_EXT
    call MPI_ALLREDUCE(MPI_IN_PLACE, dhc, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
    &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif

    ! Compute <D|H|D> or <D|(H-zshift)^2|D>
    ! call dotprod_g(dhd,doti,istwf_k,mpi_enreg,npw*nspinor,1,direc,gh_direc)
    dhd = DDOT(2*gnpw, direc, 1, gh_direc, 1)
#if defined MPI && defined MPI_EXT
    call MPI_ALLREDUCE(MPI_IN_PLACE, dhd, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
    &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif

    if(prtvol==-level)then
     write(message,'(a,3f14.6)') 'cgwf : chc,dhc,dhd=',chc,dhc,dhd
     call wrtout(06,message,'COLL')
    end if

    !======================================================================
    !======= COMPUTE MIXING FACTORS - CHECK FOR CONVERGENCE ===============
    !======================================================================

    ! Compute tan(2 theta),sin(theta) and cos(theta)
    ! theta is the rotation angle in the 2D subspace {|C>,|D>} minimizing the
    ! Rayleigh quotient.
    ! NOTE(review): no guard against chc == dhd (division by zero) -- the
    ! serial cgwf presumably shares this property; verify before reuse.
    tan2th=2.0_dp*dhc/(chc-dhd)

    if (abs(tan2th)<1.d-05) then

     ! Small-angle case: second-order Taylor expansions of cos/sin(theta)
     ! avoid cancellation in the sqrt-based formulas below.
     costh=1.0_dp-0.125_dp*tan2th**2
     sinth=0.5_dp*tan2th*(1.0_dp-0.375_dp*tan2th**2)

     ! Check that result is above machine precision
     if (abs(sinth)<epsilon(0._dp)) then
      write(message, '(a,es16.4)' ) ' cgwf: converged with tan2th=',tan2th
      call wrtout(06,message,'COLL')
      ! Number of one-way 3D ffts skipped
      nskip=nskip+2*(nline-iline)
      ! Exit from the loop on iline
      exit
     end if
    else
     ! General case: recover cos/sin(theta) from tan(2*theta) via half-angle
     ! identities; sinth carries the sign of tan2th.
     root=sqrt(1.0_dp+tan2th**2)
     costh=sqrt(0.5_dp+0.5_dp/root)
     sinth=sign(sqrt(0.5_dp-0.5_dp/root),tan2th)
    end if

    ! Check for lower of two possible roots (same sign as curvature at theta where slope is zero)
    diff=(chc-dhd)
    ! Swap c and d if value of diff is positive
    ! (the two stationary angles differ by pi/2; pick the energy minimum)
    if (diff>zero) then
     swap=costh
     costh=-sinth
     sinth=swap
     if(prtvol<0 .or. prtvol>=10)then
      write(message,*)'   Note: swap roots, iline,diff=',iline,diff
      call wrtout(06,message,'COLL')
     end if
    end if


    !======================================================================
    !=========== GENERATE NEW H|wf>, Vnl|Wf>, S|Wf> ... ===================
    !======================================================================

    ! Rotate the wavefunction: |C> <- cos(theta)|C> + sin(theta)|D>,
    ! and store the updated band back into the distributed cg array.
    ! TODO: one blas call ...
    call ZDSCAL(gnpw, costh, cwavef, 1)
    call ZAXPY(gnpw, dcmplx(sinth), direc, 1, cwavef, 1)
    call ZCOPY(gnpw, cwavef, 1, cg(1,npw*(iband-1)+icg+gmin), 1)

    ! Rotate H|C> with the same angle (avoids recomputing it from scratch).
    ! TODO: one blas call ...
    call ZDSCAL(gnpw, costh, ghc, 1)
    call ZAXPY(gnpw, dcmplx(sinth), gh_direc, 1, ghc, 1)

    ! Rotate Vnl|C> likewise.
    ! TODO: one blas call ...
    call ZDSCAL(gnpw, costh, gvnlc, 1)
    call ZAXPY(gnpw, dcmplx(sinth), gvnl_direc, 1, gvnlc, 1)

    !======================================================================
    !=========== CHECK CONVERGENCE AGAINST TRIAL ENERGY ===================
    !======================================================================

    ! Energy change produced by the rotation above.
    deltae=chc*(costh**2-1._dp)+dhd*sinth**2+2._dp*costh*sinth*dhc


    ! Check convergence and eventually exit
    if (iline==1) then
     deold=deltae
    else if (abs(deltae)<0.005_dp*abs(deold) .and. iline/=nline)then
     if(prtvol>=10)then
      write(message, '(a,i4,1x,a,1p,e12.4,a,e12.4,a)' ) ' cgwf: line',iline,' deltae=',deltae,' < 0.005*',deold,' =>skip lines'
      call wrtout(06,message,'COLL')
     end if
     ! Number of one-way 3D ffts skipped
     nskip=nskip+2*(nline-iline)
     ! Exit from the loop on iline
     exit
    end if

    !======================================================================
    !================ END LOOP FOR A GIVEN BAND ===========================
    !======================================================================

    !     Note that there are three "exit" instructions inside
   end do
  else
   ! nline==0 , needs to provide a residual
   ! ===================================================
   resid(iband)=-one
   ! End nline==0 case
  end if

  !======================================================================
  !=============== END OF CURRENT BAND: CLEANING ========================
  !======================================================================

  if(prtvol<0) call status(0,filstat,iexit,level,'after iline   ')

  !   Structured debugging : if prtvol=-level, stop here.
  if(prtvol==-level)then
   write(message,'(a1,a,a1,a,i1,a)') ch10,' cgwf : exit ',ch10,'  prtvol=-',level,', debugging mode => stop '
   call wrtout(06,message,'COLL')
   call leave_new('COLL')
  end if


  !======================================================================
  !============= COMPUTE HAMILTONIAN IN WFs SUBSPACE ====================
  !======================================================================

  !Loop over already optimized bands in a block
  ! Accumulate <c_ii|H|c_iband> and <c_ii|Vnl|c_iband> for the subspace
  ! matrices; local partial dot products are summed across the G-group below.
  index=1
  do ii=1,iband
   ! chccmplx
   zdotarr(index) = ZDOTC(gnpw, cg(1,npw*(ii-1)+icg+gmin), 1, ghc, 1)
   ! cvccmplx
   zdotarr(index+1) = ZDOTC(gnpw, cg(1,npw*(ii-1)+icg+gmin), 1, gvnlc, 1)
   index=index+2
  end do
#if defined MPI && defined MPI_EXT
  call MPI_ALLREDUCE(MPI_IN_PLACE, zdotarr, 2*iband, MPI_DOUBLE_COMPLEX, MPI_SUM, &
  &                  mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif
  index=1
  do ii=1,iband
   ! Store real and imag parts in Hermitian storage mode (packed):
   subvnl(isubh  )=real(zdotarr(index+1))
   subvnl(isubh+1)=aimag(zdotarr(index+1))
   subham(isubh  )=real(zdotarr(index))
   subham(isubh+1)=aimag(zdotarr(index))
   isubh=isubh+2
   index=index+2
  end do
 end do ! iband in a block
 ! End big iband loop
 !===========================================================================

 ! gather the cg-array to all processors!
#ifdef MPI
 do iband=1,nband
  ! we have to copy twice because an allgather on cg would overwrite data if
  ! gmpw*nprocs is not exactly npw
  ! NOTE(review): cgbuf is still passed as BOTH send and receive buffer to
  ! MPI_ALLGATHER, which the MPI standard forbids (buffer aliasing). A
  ! separate send buffer or MPI_IN_PLACE should be used -- verify on a
  ! strict MPI implementation.
  call ZCOPY(gnpw, cg(1,npw*(iband-1)+icg+gmin), 1, cgbuf, 1)
  call MPI_ALLGATHER(cgbuf, 2*gmpw, MPI_DOUBLE_PRECISION, cgbuf, 2*gmpw, &
  &                  MPI_DOUBLE_PRECISION, mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
  call ZCOPY(npw, cgbuf, 1, cg(1,npw*(iband-1)+icg+1), 1)
 end do
#endif

!Debugging ouputs
if(prtvol==-level)then
 isubh=1
 write(message,'(a)') ' cgwf : isubh  subham(isubh:isubh+1)  subvnl(isubh:isubh+1)'
 do iband=1,nband
  do ii=1,iband
   write(message,'(i5,4es16.6)')isubh,subham(isubh:isubh+1),subvnl(isubh:isubh+1)
   call wrtout(06,message,'COLL')
   isubh=isubh+2
  end do
 end do
end if

#if defined MPI
           tend=MPI_Wtime()
#endif

!======================================================================
!==================== FINAL DEALLOCATIONS =============================
!======================================================================

deallocate(conjgr,cwavef,direc,scprod)
deallocate(vresid,gh_direc,gvnl_direc);
deallocate(ghc,gvnlc,zdotarr,cgbuf,packin,packout,unpackin,unpackout)

if(prtvol<0) call status(0,filstat,iexit,level,'exit          ')
call timab(22,2,tsec)

if(mpi_enreg%gindex==0) write(*,'(a,f7.2,a,i3)') '@@@ cgwf_htor took ', tend-tstart, ' seconds for group ', mpi_enreg%ggroup

end subroutine cgwf_htor
!!***

Generated by  Doxygen 1.6.0   Back to index