!///////////////////////////////////////////////////////////////////////
!
!      Author:          M. Shiga
!      Last updated:    Nov 10, 2018 by M. Shiga
!      Description:     particle mesh Ewald
!
!///////////////////////////////////////////////////////////////////////

#ifndef nopme

!***********************************************************************
      subroutine force_pmeewald_setup_MPI
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use mm_variables, only : ncharge

      use mm_variables, only : Nfft, nfft_in, Bsp_order, ScRs, &
     &   BthetaX, BthetaY, BthetaZ, dBthetaX, dBthetaY, dBthetaZ, &
     &   BsplineModuleX, BsplineModuleY, BsplineModuleZ

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      Nfft(:) = nfft_in(:)

      Bsp_order = 4

      if ( .not. allocated( ScRs ) ) &
     &   allocate( ScRs(3,ncharge) )
      if ( .not. allocated( BthetaX ) ) &
     &   allocate( BthetaX(Bsp_order,ncharge) )
      if ( .not. allocated( BthetaY ) ) &
     &   allocate( BthetaY(Bsp_order,ncharge ) )
      if ( .not. allocated( BthetaZ ) ) &
     &   allocate( BthetaZ(Bsp_order,ncharge ) )
      if ( .not. allocated( dBthetaX ) ) &
     &   allocate( dBthetaX(Bsp_order,ncharge) )
      if ( .not. allocated( dBthetaY ) ) &
     &   allocate( dBthetaY(Bsp_order,ncharge) )
      if ( .not. allocated( dBthetaZ ) ) &
     &   allocate( dBthetaZ(Bsp_order,ncharge) )
      if ( .not. allocated( BsplineModuleX ) ) &
     &   allocate( BsplineModuleX(Nfft(1)) )
      if ( .not. allocated( BsplineModuleY ) ) &
     &   allocate( BsplineModuleY(Nfft(2)) )
      if ( .not. allocated( BsplineModuleZ ) ) &
     &   allocate( BsplineModuleZ(Nfft(3)) )

      call PME_GetSize
      call PME_setup

      return
      end





!***********************************************************************
      subroutine PME_GetSize
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : nprocs_sub

      use mm_variables, only : ncharge

      use mm_variables, only : &
     &   Nfft, Nfftdim, Bsp_order, SizeBtheta, SizeGridQ, MaxGrid

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      integer :: Sizeheap
      integer :: Sizestack
      integer :: SizeFFTtable
      integer :: SizeFFTwork

      integer :: ii, jj

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      call PME_Get_FFTdimension( SizeFFTtable, SizeFFTwork )

      SizeBtheta = ncharge * Bsp_order
      SizeGridQ  = 2 * Nfftdim(1) * Nfftdim(2) * Nfftdim(3)
      Sizeheap   = Nfft(1) + Nfft(2) + Nfft(3) + SizeFFTtable
      Sizestack  = SizeGridQ + 6 * SizeBtheta + SizeFFTwork + 3*ncharge

      if ( nprocs_sub /= 1 ) then

         if ( Nfftdim(1) >= Nfftdim(3) ) then

            ii = Nfftdim(1) / nprocs_sub
            jj = mod( Nfftdim(1), nprocs_sub )
            if ( jj /= 0 ) ii = ii + 1

            MaxGrid = ii * Nfftdim(2) * Nfftdim(3) * 2

         else

            ii = Nfftdim(3) / nprocs_sub
            jj = mod( Nfftdim(3), nprocs_sub )
            if ( jj /= 0 ) ii = ii + 1

            MaxGrid = ii * Nfftdim(2) * Nfftdim(1) * 2

         end if

      else

         MaxGrid = SizeGridQ

      end if

      return
      end





!***********************************************************************
      subroutine PME_setup
!***********************************************************************

      implicit none

      call PME_Load_Bspline_moduli

      call PME_FFTsetup

      call PME_Prep_Atom_to_Mesh ! parallel

      return
      end




!***********************************************************************
      subroutine force_pmeewald_fs_MPI
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : pot, vir, fx, fy, fz, nbead, &
     &   myrank_main, nprocs_main

      use mm_variables, only : SizeGridQ

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      real(8), dimension(SizeGridQ) :: gridQ

      integer :: m

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      do m = 1, nbead

         if ( mod( m-1, nprocs_main ) .ne. myrank_main ) cycle

         call PME_scaled_coord (m)

         call PME_bspline_coeffs

         call PME_charge_grid (gridQ)

         call PME_FFT_back (gridQ)

         call PME_calc_energy (gridQ, m, pot, vir)

         call PME_FFT_forward (gridQ)

         call PME_calc_force (gridQ, m, fx, fy, fz)

      end do

      return
      end





!***********************************************************************
      subroutine PME_scaled_coord( m )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use mm_variables, only : i_q, ncharge

      use common_variables, only : &
     &   x, y, z, boxinv, myrank_sub, nprocs_sub

      use mm_variables, only : ScRs, Nfft

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      integer :: i, k, m, Nas

      real(8) :: Sx, Sy, Sz, Rx, Ry, Rz
      real(8) :: ax, ay, az, bx, by, bz, cx, cy, cz

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      Nas = nprocs_sub - myrank_sub

      ax = boxinv(1,1)
      ay = boxinv(1,2)
      az = boxinv(1,3)
      bx = boxinv(2,1)
      by = boxinv(2,2)
      bz = boxinv(2,3)
      cx = boxinv(3,1)
      cy = boxinv(3,2)
      cz = boxinv(3,3)

      do k = nas, ncharge, nprocs_sub

         i  = i_q(k)

         Rx = x(i,m)
         Ry = y(i,m)
         Rz = z(i,m)

         Sx = ax*Rx + ay*Ry + az*Rz
         Sy = bx*Rx + by*Ry + bz*Rz
         Sz = cx*Rx + cy*Ry + cz*Rz

         if(Sx >  0.5d0) Sx = Sx - 1.0d0
         if(Sx < -0.5d0) Sx = Sx + 1.0d0
         if(Sy >  0.5d0) Sy = Sy - 1.0d0

         if(Sy < -0.5d0) Sy = Sy + 1.0d0
         if(Sz >  0.5d0) Sz = Sz - 1.0d0
         if(Sz < -0.5d0) Sz = Sz + 1.0d0

         Sx = Sx + 0.5d0
         Sy = Sy + 0.5d0
         Sz = Sz + 0.5d0

         ScRs(1,i) = Nfft(1) * Sx
         ScRs(2,i) = Nfft(2) * Sy
         ScRs(3,i) = Nfft(3) * Sz

      end do

      return
      end





!***********************************************************************
      subroutine PME_Load_Bspline_moduli
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : myrank

      use mm_variables, only : &
     &   Nfft, Bsp_order, BsplineModuleX, BsplineModuleY, BsplineModuleZ

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      integer, parameter :: MAXORDER = 25
      integer, parameter :: MAXNFFT = 1000

      real(8), dimension(MAXORDER) :: array
      real(8), dimension(MAXORDER) :: darray
      real(8) :: w
      real(8), dimension(MAXNFFT) :: bsp_arr

      integer :: i, maxn, ierr

!-----------------------------------------------------------------------
!     /*   check                                                      */
!-----------------------------------------------------------------------

      ierr = 0

      if ( Bsp_order > MAXORDER ) then
         ierr = 1
         if ( myrank .eq. 0 ) &
     &      write( 6, '(a)' ) 'Error - MAXORDER too small.'
      end if

      maxn = max( Nfft(1), Nfft(2), Nfft(3) )

      if ( maxn > MAXNFFT ) then
         ierr = 1
         if ( myrank .eq. 0 ) &
     &      write( 6, '(a)' ) 'Error - MAXNFFT too small.'
      end if

      call error_handling_MPI &
     &   ( ierr, 'subroutine PME_Load_Bspline_moduli', 34 )

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      w = 0.d0

      call PME_fill_bspline( w, Bsp_order, array, darray )

      do i = 1, maxn
         bsp_arr(i) = 0.d0
      end do

      do i = 2, Bsp_order+1
         bsp_arr(i) = array(i-1)
      end do

      call PME_dFTModulus( BsplineModuleX, bsp_arr, Nfft(1) )
      call PME_dFTModulus( BsplineModuleY, bsp_arr, Nfft(2) )
      call PME_dFTModulus( BsplineModuleZ, bsp_arr, Nfft(3) )

      return
      end





!***********************************************************************
      subroutine PME_dFTModulus( bsp_mod, bsp_arr, Numfft )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : pi

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      integer :: Numfft

      real(8), dimension(Numfft) :: bsp_mod, bsp_arr

      integer :: i, j
      real(8) :: cst, snt, arg, tiny

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      tiny = 1.d-7

      do i = 1, Numfft

        cst = 0.d0
        snt = 0.d0

        do j = 1,Numfft

          arg = 2.0d0 * pi * (i-1) * (j-1) / Numfft
          cst = cst + bsp_arr(j) * cos(arg)
          snt = snt + bsp_arr(j) * sin(arg)

        end do

        bsp_mod(i) = cst*cst + snt*snt

      end do

      do i = 1, Numfft

         if ( bsp_mod(i) < tiny ) then
            bsp_mod(i) = 0.5d0 * ( bsp_mod(i-1) + bsp_mod(i+1) )
         end if

      end do

      return
      end





!***********************************************************************
      subroutine PME_charge_grid( gridQ )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : myrank_sub, nprocs_sub

      use mm_variables, only : q, i_q, ncharge

      use mm_variables, only : &
     &   Bsp_order, Nfft, ScRs, BthetaX, BthetaY, BthetaZ, NfftDim

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      integer :: l, ith1, ith2, ith3, i, j, k, ii, jj, kk, i2, Nas

      real(8), dimension(2,NfftDim(1),NfftDim(2),NfftDim(3)) :: gridQ
      real(8), dimension(NfftDim(1),NfftDim(2),NfftDim(3)) :: Q2

      real(8) :: fct0, fct1, fct2

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      Nas = nprocs_sub - myrank_sub

      Q2 = 0.d0

      do l = Nas, ncharge, nprocs_sub

         i2  = i_q(l)
         fct0 = q(i2)

         kk = int(ScRs(3,l)) - Bsp_order

         do ith3 = 1 , Bsp_order

            kk = kk + 1
            k  = kk + 1 + (Nfft(3) - isign(Nfft(3),kk))/2

            fct1 = fct0 * BthetaZ(ith3,l)

            jj = int(ScRs(2,l)) - Bsp_order

            do ith2 = 1 , Bsp_order

               jj = jj + 1
               j  = jj + 1 + (Nfft(2) - isign(Nfft(2),jj))/2

               fct2 = BthetaY(ith2,l) * fct1

               ii = int(ScRs(1,l)) - Bsp_order

               do ith1 = 1 , Bsp_order

                  ii = ii + 1
                  i  = ii + 1 + (Nfft(1) - isign(Nfft(1),ii))/2

                  Q2(i,j,k) = Q2(i,j,k) + BthetaX(ith1,l) * fct2

               end do

            end do

         end do

      end do

      if ( nprocs_sub /= 1 ) call PME_SumChargeDens( Q2 )

      do i = 1, NfftDim(1)
      do j = 1, NfftDim(2)
      do k = 1, NfftDim(3)

         gridQ(1,i,j,k) = Q2(i,j,k)
         gridQ(2,i,j,k) = 0.d0

      end do
      end do
      end do

      return
      end





!***********************************************************************
      subroutine PME_calc_energy( gridQ, m, pot, vir )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : pi, boxinv, volume, nbead, &
     &   myrank_sub, nprocs_sub

      use mm_variables, only : alpha_ewald

      use mm_variables, only : Nfft, BsplineModuleX, BsplineModuleY,  &
     &    BsplineModuleZ, Nfftdim

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      integer :: k1, k2, k3, Nff, ini, m, ihx, ihy, ihz, Nfx, Nfy, Nfz
      integer :: Nas

      real(8), dimension(2,NfftDim(1),NfftDim(2),NfftDim(3)) :: gridQ
      real(8), dimension(2,NfftDim(1),NfftDim(2),NfftDim(3)) :: gridQ2

      real(8), dimension(nbead) :: pot
      real(8), dimension(3,3)   :: vir

      real(8) :: pref, denom, eterm, vterm, energy, struc2
      real(8) :: kn2, Bspx, Bspy
      real(8) :: IHxx, IHxy, IHxz, IHyx, IHyy, IHyz, IHzx, IHzy, IHzz
      real(8) :: Vxx, Vxy, Vxz, Vyy, Vyz, Vzz
      real(8) :: knx, kny, knz, est2, vtmx, vtmy, vtmz

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      Nas = nprocs_sub - myrank_sub

      IHxx = boxinv(1,1)
      IHxy = boxinv(1,2)
      IHxz = boxinv(1,3)
      IHyx = boxinv(2,1)
      IHyy = boxinv(2,2)
      IHyz = boxinv(2,3)
      IHzx = boxinv(3,1)
      IHzy = boxinv(3,2)
      IHzz = boxinv(3,3)

      Nff = Nfft(1) * Nfft(2)
      pref = - (pi*pi) / (alpha_ewald*alpha_ewald)

      Nfx = Nfft(1) / 2
      if ( 2*Nfx < Nfft(1) ) Nfx = Nfx + 1

      Nfy = Nfft(2) / 2
      if ( 2*Nfy < Nfft(2) ) Nfy = Nfy + 1

      Nfz = Nfft(3) / 2
      if ( 2*Nfz < Nfft(3) ) Nfz = Nfz + 1

      energy = 0.d0

      Vxx = 0.d0
      Vxy = 0.d0
      Vxz = 0.d0
      Vyy = 0.d0
      Vyz = 0.d0
      Vzz = 0.d0

      do k1 = Nas, Nfft(1), nprocs_sub

         Bspx = BsplineModuleX(k1)

         do k2 = 1, Nfft(2)

            Bspy = BsplineModuleY(k2)

            ini = 1
            if( k1==1 .and. k2==1 ) ini = 2

            do k3 = ini, Nfft(3)

               ihx = k1 - 1
               if ( k1 > Nfx ) ihx = k1 - 1 - Nfft(1)

               ihy = k2 - 1
               if ( k2 > Nfy ) ihy = k2 - 1 - Nfft(2)

               ihz = k3 - 1
               if ( k3 > Nfz ) ihz = k3 - 1 - Nfft(3)

               knx = IHxx*ihx + IHyx*ihy + IHzx*ihz
               kny = IHxy*ihx + IHyy*ihy + IHzy*ihz
               knz = IHxz*ihx + IHyz*ihy + IHzz*ihz

               kn2 = knx*knx + kny*kny + knz*knz

               denom = pi*volume * Bspx*Bspy * BsplineModuleZ(k3) * kn2

               eterm = exp( pref * kn2 ) / denom

               vterm = 2.d0 * ( -pref * kn2 + 1.d0 ) / kn2

               struc2 = gridQ(1,k1,k2,k3) * gridQ(1,k1,k2,k3) &
     &                + gridQ(2,k1,k2,k3) * gridQ(2,k1,k2,k3)

               gridQ(1,k1,k2,k3) = eterm * gridQ(1,k1,k2,k3)
               gridQ(2,k1,k2,k3) = eterm * gridQ(2,k1,k2,k3)

               gridQ2(1,k1,k2,k3) = gridQ(1,k1,k2,k3)
               gridQ2(2,k1,k2,k3) = gridQ(2,k1,k2,k3)

               energy = energy + eterm * struc2

               est2 = eterm * struc2

               vtmx = vterm * knx
               vtmy = vterm * kny
               vtmz = vterm * knz

               Vxx = Vxx + est2 * (vtmx * knx - 1.d0)
               Vxy = Vxy + est2 * vtmx * kny
               Vxz = Vxz + est2 * vtmx * knz
               Vyy = Vyy + est2 * (vtmy * kny - 1.d0)
               Vyz = Vyz + est2 * vtmy * knz
               Vzz = Vzz + est2 * (vtmz * knz - 1.d0)

            end do

         end do

      end do

      pot(m) = pot(m) + 0.5d0 * energy

      Vxx = - 0.50 * Vxx
      Vxy = - 0.50 * Vxy
      Vxz = - 0.50 * Vxz
      Vyy = - 0.50 * Vyy
      Vyz = - 0.50 * Vyz
      Vzz = - 0.50 * Vzz

      vir(1,1) = vir(1,1) + Vxx
      vir(1,2) = vir(1,2) + Vxy
      vir(1,3) = vir(1,3) + Vxz
      vir(2,1) = vir(2,1) + Vxy
      vir(2,2) = vir(2,2) + Vyy
      vir(2,3) = vir(2,3) + Vyz
      vir(3,1) = vir(3,1) + Vxz
      vir(3,2) = vir(3,2) + Vyz
      vir(3,3) = vir(3,3) + Vzz

      return
      end





!***********************************************************************
      subroutine PME_calc_force( gridQ, m, fx, fy, fz )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : boxinv, nbead, natom, &
     &   myrank_sub, nprocs_sub

      use mm_variables, only : Nfft, Nfftdim, ScRs, Bsp_order, &
     &    BthetaX, BthetaY, BthetaZ, dBthetaX, dBthetaY, dBthetaZ

      use mm_variables, only : q, i_q, ncharge

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      real(8), dimension(2,NfftDim(1),NfftDim(2),NfftDim(3)) :: gridQ
      real(8), dimension(NfftDim(1),NfftDim(2),NfftDim(3)) :: Q2
      real(8), dimension(natom,nbead) :: fx, fy, fz

      integer :: l, ll, ith1, ith2, ith3, ii, jj, kk, i, j, k, Nas, m

      real(8) :: term, zz, dBx, dBy, dBz, Bx, By, Bz, fcx, fcy, fcz
      real(8) :: IHxx, IHxy, IHxz, IHyx, IHyy, IHyz, IHzx, IHzy, IHzz

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      Nas = nprocs_sub - myrank_sub

      IHxx = boxinv(1,1)
      IHxy = boxinv(1,2)
      IHxz = boxinv(1,3)
      IHyx = boxinv(2,1)
      IHyy = boxinv(2,2)
      IHyz = boxinv(2,3)
      IHzx = boxinv(3,1)
      IHzy = boxinv(3,2)
      IHzz = boxinv(3,3)

      do k = Nas, NfftDim(3), nprocs_sub
      do j = 1, NfftDim(2)
      do i = 1, NfftDim(1)

            Q2(i,j,k) = gridQ(1,i,j,k)

      end do
      end do
      end do

      if( nprocs_sub /= 1 ) call PME_DistChargeDens( Q2 )

      do l = Nas, ncharge, nprocs_sub

         ll = i_q(l)
         zz = q(ll)

         fcx = 0.d0
         fcy = 0.d0
         fcz = 0.d0

         kk = int(ScRs(3,l)) - Bsp_order

         do ith3 = 1,Bsp_order

            kk = kk + 1
            k = kk + 1 + (Nfft(3) - isign(Nfft(3),kk))/2
            jj = int(ScRs(2,l)) - Bsp_order
            dBz = dBthetaZ(ith3,l)
            Bz  =  BthetaZ(ith3,l)

            do ith2 = 1,Bsp_order

               jj = jj + 1
               j = jj + 1 + (Nfft(2) - isign(Nfft(2),jj))/2
               ii = int(ScRs(1,l)) - Bsp_order
               dBy = dBthetaY(ith2,l)
               By  =  BthetaY(ith2,l)

               do ith1 = 1,Bsp_order

                  ii = ii + 1
                  i = ii + 1 + (Nfft(1) - isign(Nfft(1),ii))/2
                  term = zz * Q2(i,j,k)
                  dBx = dBthetaX(ith1,l)
                  Bx  =  BthetaX(ith1,l)

                  fcx = fcx - Nfft(1) * term * dBx *  By *  Bz
                  fcy = fcy - Nfft(2) * term *  Bx * dBy *  Bz
                  fcz = fcz - Nfft(3) * term *  Bx *  By * dBz

               end do

            end do

         end do

         fx(ll,m) = fx(ll,m) + IHxx*fcx + IHyx*fcy + IHzx*fcz
         fy(ll,m) = fy(ll,m) + IHxy*fcx + IHyy*fcy + IHzy*fcz
         fz(ll,m) = fz(ll,m) + IHxz*fcx + IHyz*fcy + IHzz*fcz

      end do

      return
      end





!***********************************************************************
      subroutine PME_bspline_coeffs
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : myrank_sub, nprocs_sub

      use mm_variables, only : ncharge

      use mm_variables, only : &
     &   ScRs, Bsp_order, BthetaX, BthetaY, BthetaZ, dBthetaX, &
     &   dBthetaY, dBthetaZ

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      real(8) :: x, y, z

      integer :: i, Nas

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      Nas = nprocs_sub - myrank_sub

      do i = Nas, ncharge, nprocs_sub

         x = ScRs(1,i) - int(ScRs(1,i))
         y = ScRs(2,i) - int(ScRs(2,i))
         z = ScRs(3,i) - int(ScRs(3,i))

         call PME_fill_bspline &
     &      ( x, Bsp_order, BthetaX(1,i), dBthetaX(1,i) )
         call PME_fill_bspline &
     &      ( y, Bsp_order, BthetaY(1,i), dBthetaY(1,i) )
         call PME_fill_bspline &
     &      ( z, Bsp_order, BthetaZ(1,i), dBthetaZ(1,i) )

      end do

      return
      end





!***********************************************************************
      subroutine PME_fill_bspline( w, order, array, darray )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      integer :: order, i

      real(8) :: w

      real(8), dimension(order) :: array, darray

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

!     //   linear case

      call PME_init( array, w, order )

!     //   compute standard b-spline recursion

      do i = 3, order-1

         call PME_one_pass( array, w, i )

      end do

!     //   perform standard b-spline differentiation

      call PME_diff( array, darray, order )

!     //   one more recursion

      call PME_one_pass( array, w, order )

      return
      end





!***********************************************************************
      subroutine PME_init( c, x, order )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      integer :: order

      real(8), dimension(order) :: c

      real(8) :: x

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      c(order) = 0.d0
      c(2) = x
      c(1) = 1.d0 - x

      return
      end





!***********************************************************************
      subroutine PME_one_pass( c, x, i )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      real(8), dimension(*) :: c
      real(8) :: x, div

      integer :: i, j

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      div  = 1.d0 / (i-1)
      c(i) = div * x * c(i-1)

      do j = 1, i-2
        c(i-j) = div * ( (x+j) * c(i-j-1) + (i-j-x) * c(i-j) )
      end do

      c(1) = div * ( 1.d0 - x ) * c(1)

      return
      end





!***********************************************************************
      subroutine PME_diff( c, d, order )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      real(8), dimension(*) :: c, d

      integer :: order, j

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      d(1) = - c(1)

      do j = 2, order

        d(j) = c(j-1) - c(j)

      end do

      return
      end





!***********************************************************************
      subroutine PME_Get_FFTdimension( SizeFFTtable, SizeFFTwork )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use mm_variables

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      integer :: NfftMax
      integer, dimension(3) :: NLS
      integer :: SizeFFTtable, SizeFFTwork

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      NfftMax = max( Nfft(1) , Nfft(2) , Nfft(3) )

      Nfftdim(:) = Nfft(:)

      NLS(:) = Nfft(:) / 2

      if( Nfft(1) == 2 * NLS(1) ) Nfftdim(1) = Nfft(1) + 1
      if( Nfft(2) == 2 * NLS(2) ) Nfftdim(2) = Nfft(2) + 1
      if( Nfft(3) == 2 * NLS(3) ) Nfftdim(3) = Nfft(3) + 1

      NffTable     = 4 * NfftMax + 15
      NffWork      = NfftMax
      SizeFFTtable = 3 * NffTable
      SizeFFTwork  = 2 * NfftMax

      if ( .not. allocated( FFTtable ) ) &
     &   allocate( FFTtable(SizeFFTtable) )

      if( SizeFFTwork /= 0 ) then
        if ( .not. allocated( FFTwork ) ) &
     &     allocate( FFTwork(SizeFFTwork) )
      else
        if ( .not. allocated( FFTwork ) ) &
     &     allocate( FFTwork(1) )
      end if

      return
      end





!***********************************************************************
      subroutine PME_FFTsetup
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use mm_variables, only : Nfft, FFTtable, NffTable

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      integer :: n1, n2, n3

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      n1 = Nfft(1)
      n2 = Nfft(2)
      n3 = Nfft(3)

      call PME_pubz3di( n1, n2, n3, FFTtable, NffTable )

      return
      end





!***********************************************************************
      subroutine PME_FFT_forward( array )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use mm_variables, only : &
     &   Nfft, Nfftdim, FFTtable, FFTwork, NffTable, NffWork

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      real(8), dimension(*) :: array

      integer :: nnsign, n1, n2, n3, d1, d2

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      n1 = Nfft(1)
      n2 = Nfft(2)
      n3 = Nfft(3)

      d1 = Nfftdim(1)
      d2 = Nfftdim(2)

      nnsign = 1

      call PME_pubz3d( &
     &   nnsign, n1, n2, n3, array, d1, d2, FFTtable, NffTable, FFTwork, &
     &   NffWork )

      return
      end





!***********************************************************************
      subroutine PME_FFT_back( array )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use mm_variables, only : &
     &   Nfft, Nfftdim, FFTtable, FFTwork, NffTable, NffWork

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      real(8), dimension(*) :: array

      integer :: nnsign, n1, n2, n3, d1, d2

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      n1 = Nfft(1)
      n2 = Nfft(2)
      n3 = Nfft(3)

      d1 = Nfftdim(1)
      d2 = Nfftdim(2)

      nnsign = -1

      call PME_pubz3d( &
     &   nnsign, n1, n2, n3, array, d1, d2, FFTtable, NffTable, FFTwork, &
     &   NffWork )

      return
      end





!***********************************************************************
      subroutine PME_pubz3di( n1, n2, n3, table, ntable )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      integer :: n1, n2, n3

!     //   ntable should be 4*max(n1,n2,n3)+15
      integer :: ntable

      real(8), dimension(ntable,3) :: table

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      call cffti( n1, table(1,1) )
      call cffti( n2, table(1,2) )
      call cffti( n3, table(1,3) )

      return
      end





!***********************************************************************
      subroutine PME_pubz3d( &
     &   nnsign, n1, n2, n3, w, ld1, ld2, table, ntable, work, nwork )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : myrank_sub, nprocs_sub

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

!     //   nwork should be max(n1,n2,n3)
      integer :: nwork

!     //   ntable should be 4*max(n1,n2,n3) +15
      integer :: ntable

      integer :: n1, n2, n3, ld1, ld2, nnsign, i, j, k, Nas

      complex(8), dimension(ld1,ld2,n3) :: w
      complex(8), dimension(nwork) :: work

      real(8), dimension(ntable,3) :: table

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      Nas = nprocs_sub - myrank_sub

      if ( nnsign == -1 ) then

!     //   transform along X

         do k = Nas, n3, nprocs_sub

            do j = 1, n2

               do i = 1, n1
                  work(i) = w(i,j,k)
               end do

               call cfftf( n1, work, table(1,1) )

               do i = 1, n1
                  w(i,j,k) = work(i)
               end do

            end do

         end do

!        //   transform along Y

         do k = Nas, n3, nprocs_sub

            do i = 1, n1

               do j = 1,n2
                  work(j) = w(i,j,k)
               end do

               call cfftf( n2, work, table(1,2) )

               do j = 1, n2
                  w(i,j,k) = work(j)
               end do

           end do

         end do

         if ( nprocs_sub /= 1 ) &
     &      call PME_FFT_ChAxisF( w, n1, n2, n3, ld1, ld2 )

!        //   transform along Z

         do i = Nas, n1, nprocs_sub

            do j = 1, n2

               do k = 1,n3
                  work(k) = w(i,j,k)
               end do

               call cfftf( n3, work, table(1,3) )

               do k = 1, n3
                  w(i,j,k) = work(k)
               end do

            end do

         end do

      else

!     //   transform along Z

         do i = Nas, n1, nprocs_sub

            do j = 1, n2

               do k = 1, n3
                  work(k) = w(i,j,k)
               end do

               call cfftb( n3, work, table(1,3) )

               do k = 1, n3
                  w(i,j,k) = work(k)
               end do

            end do

         end do

         if ( nprocs_sub /= 1 ) &
     &      call PME_FFT_ChAxisB( w, n1, n2, n3, ld1, ld2 )

!        //   transform along X

         do k = Nas, n3, nprocs_sub

            do j = 1, n2

               do i = 1, n1
                  work(i) = w(i,j,k)
               end do

               call cfftb( n1, work, table(1,1) )

               do i = 1, n1
                  w(i,j,k) = work(i)
               end do

            end do

         end do

!        //   transform along Y

         do k = Nas, n3, nprocs_sub

            do i = 1, n1

               do j = 1,n2
                  work(j) = w(i,j,k)
               end do

               call cfftb( n2, work, table(1,2) )

               do j = 1, n2
                  w(i,j,k) = work(j)
               end do

            end do

         end do

      end if

      return
      end





!***********************************************************************
      subroutine PME_Prep_Atom_to_Mesh
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : nprocs_sub
      use mm_variables, only : NfftDim, Nscnt, Ndisp, Nrenum

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      include 'mpif.h'

      integer :: i, j, k, l, nz, icpu

      integer, dimension(nprocs_sub) :: Numnz

!-----------------------------------------------------------------------
!     /*   memory allocation                                          */
!-----------------------------------------------------------------------

      if ( .not. allocated( Nscnt ) ) &
     &   allocate( Nscnt(0:nprocs_sub-1) )
      if ( .not. allocated( Ndisp ) ) &
     &   allocate( Ndisp(0:nprocs_sub-1) )

      if ( .not. allocated( Nrenum ) ) &
     &   allocate( Nrenum(NfftDim(3)) )

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      i = NfftDim(3) / nprocs_sub
      Numnz = i

      j = mod( NfftDim(3), nprocs_sub )
      if( j /= 0 ) then
        do i = nprocs_sub-j+1, nprocs_sub
          NumNz(i) = NumNz(i) + 1
        end do
      end if

      l = 0

      do i = 1, nprocs_sub

         do j = 1, NumNz(i)

            l = l + 1
            nz = 0

            do k = 1, NfftDim(3)

               icpu = nprocs_sub - mod(k-1,nprocs_sub)

               if ( icpu == i ) then
                  nz = nz + 1
                  if ( nz == j ) then
                     Nrenum(l) = k
                     exit
                  end if
               end if

            end do

         end do

      end do

      do i = 1, nprocs_sub

         Nscnt(i-1) = Numnz(i) * NfftDim(2) * NfftDim(1)

      end do

      Ndisp(0) = 0

      do i = 1, nprocs_sub-1

        Ndisp(i) = Ndisp(i-1) + Nscnt(i-1)

      end do

      return
      end





!***********************************************************************
      subroutine PME_SumChargeDens( Q )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : myrank_sub, nprocs_sub, mpi_comm_sub

      use mm_variables, only : NfftDim, Nscnt, Ndisp, Nrenum

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      include 'mpif.h'

      integer :: i , j, k, l, kk, Nall, Nas, ierror

      real(8), dimension(NfftDim(1),NfftDim(2),NfftDim(3)) :: Q

      real(8), dimension(:), allocatable :: BUFF1,BUFF2

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      Nas = nprocs_sub - myrank_sub

      Nall = NfftDim(1) * NfftDim(2) * NfftDim(3)

      if ( .not. allocated(BUFF1 ) ) &
     &   allocate(BUFF1(Nall))
      if ( .not. allocated(BUFF2 ) ) &
     &   allocate(BUFF2(Nall))

      l = 0

      do k = 1, NfftDim(3)

         kk = Nrenum(k)

         do j = 1, NfftDim(2)

            do i = 1, NfftDim(1)

               l = l + 1
               Buff1(l) = Q(i,j,kk)

            end do

         end do

      end do

      call MPI_BARRIER( mpi_comm_sub, ierror )

      call MPI_REDUCE( Buff1, Buff2, Nall, MPI_DOUBLE_PRECISION, &
     &                 MPI_SUM, 0, mpi_comm_sub, ierror )


      call MPI_SCATTERV( Buff2, Nscnt, Ndisp, MPI_DOUBLE_PRECISION, &
     &                   Buff1, Nscnt(myrank_sub), MPI_DOUBLE_PRECISION, &
     &                   0, mpi_comm_sub, ierror )

      l = 0

      do k = Nas, NfftDim(3), nprocs_sub

         do j = 1, NfftDim(2)

            do i = 1, NfftDim(1)

               l = l + 1
               Q(i,j,k) = Buff1(l)

            end do

         end do

      end do

      return
      end





!***********************************************************************
      subroutine PME_DistChargeDens(Q)
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : myrank_sub, nprocs_sub, mpi_comm_sub

      use mm_variables, only : NfftDim, Nscnt, Ndisp, Nrenum

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      include 'mpif.h'

      integer :: i , j, k, l, kk, Nas, ierror, Nall

      real(8), dimension(NfftDim(1),NfftDim(2),NfftDim(3)) :: Q

      real(8), dimension(:), allocatable :: BUFF1, BUFF2

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      Nas = nprocs_sub - myrank_sub

      Nall = NfftDim(1) * NfftDim(2) * NfftDim(3)

      if ( .not. allocated(BUFF1 ) ) &
     &   allocate(BUFF1(Nall))
      if ( .not. allocated(BUFF2 ) ) &
     &   allocate(BUFF2(Nall))

      l = 0

      do k = Nas, NfftDim(3), nprocs_sub

         do j = 1, NfftDim(2)

            do i = 1, NfftDim(1)

               l = l + 1
               Buff1(l) = Q(i,j,k)

            end do

         end do

      end do

      call MPI_ALLGATHERV( Buff1, Nscnt(myrank_sub), &
     &                     MPI_DOUBLE_PRECISION, Buff2, Nscnt, Ndisp, &
     &                     MPI_DOUBLE_PRECISION, mpi_comm_sub, ierror )

      l = 0

      do k = 1, NfftDim(3)

         kk = Nrenum(k)

         do j = 1, NfftDim(2)

            do i = 1, NfftDim(1)

               l = l + 1
               Q(i,j,kk) = Buff2(l)

            end do

         end do

      end do

      return
      end





!***********************************************************************
      subroutine PME_FFT_ChAxisF( w, n1, n2, n3, ld1, ld2 )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : myrank_sub, nprocs_sub, mpi_comm_sub

      use mm_variables, only: MaxGrid

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      include 'mpif.h'

      integer :: i, j, k, ii, jj, kk, Nas, ireq, ierror
      integer :: n1, n2, n3, ld1, ld2

      integer, dimension(nprocs_sub) :: NSdata, NRdata
      integer, dimension(MPI_STATUS_SIZE) :: istatus

      real(8), dimension(2,ld1,ld2,n3) :: w
      real(8), dimension(MaxGrid,nprocs_sub) :: Dsend, Drecv
      real(8), dimension(MaxGrid) :: Vsend, Vrecv

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      Nas = nprocs_sub - myrank_sub

!     //   Send buffer

      NSdata = 0

      do k = Nas, n3, nprocs_sub

        do i = 1, n1

          ii = nprocs_sub - mod(i-1,nprocs_sub)   

          if(ii==(myrank_sub+1)) cycle

          do j = 1, n2

            jj = NSdata(ii)
            Dsend( jj+1, ii ) = w(1,i,j,k)
            Dsend( jj+2, ii ) = w(2,i,j,k)
            NSdata(ii) = NSdata(ii) + 2

          end do

        end do

      end do

!     //   Recieve Count

      NRdata = 0

      do i = Nas, n1, nprocs_sub

        do k = 1, n3

          kk = nprocs_sub - mod(k-1,nprocs_sub)

          if(kk==(myrank_sub+1)) cycle

          NRdata(kk) = NRdata(kk) + n2 * 2

        end do

      end do

!     //

      do i = 1, nprocs_sub

         ii = i - 1

         if ( NSdata(i) /= 0 ) then

            do j = 1, NSdata(i)
              Vsend(j) = Dsend(j,i)
            end do

            call MPI_ISEND( Vsend, NSdata(i), MPI_DOUBLE_PRECISION, &
     &                      ii, i, mpi_comm_sub, ireq, ierror )

         end if

         if ( NRdata(i) /= 0 ) then

            call MPI_RECV( Vrecv,NRdata(i), MPI_DOUBLE_PRECISION, ii, &
     &                     MPI_ANY_TAG, mpi_comm_sub, istatus, ierror )

            do j = 1, NRdata(i)
               Drecv(j,i) = Vrecv(j)
            end do

         end if

         if ( NSdata(i) /= 0 ) call MPI_WAIT( ireq, istatus, ierror )

      end do

      NRdata = 0

      do k = 1, n3

        kk = nprocs_sub - mod(k-1, nprocs_sub)

        if( kk == (myrank_sub + 1) ) cycle

        do i = Nas, n1, nprocs_sub

           do j = 1, n2

             jj = NRdata(kk)

             w(1,i,j,k) = Drecv( jj+1, kk )
             w(2,i,j,k) = Drecv( jj+2, kk )

             NRdata(kk) = NRdata(kk) + 2

           end do

         end do

      end do

      return
      end





!***********************************************************************
      subroutine PME_FFT_ChAxisB( w, n1, n2, n3, ld1, ld2 )
!***********************************************************************

!-----------------------------------------------------------------------
!     /*   shared variables                                           */
!-----------------------------------------------------------------------

      use common_variables, only : myrank_sub, nprocs_sub, mpi_comm_sub

      use mm_variables, only: MaxGrid

!-----------------------------------------------------------------------
!     /*   local variables                                            */
!-----------------------------------------------------------------------

      implicit none

      include 'mpif.h'

      integer :: n1, n2, n3, ld1, ld2
      integer :: i, j, k, ii, jj, kk, Nas, ireq, ierror

      integer, dimension(MPI_STATUS_SIZE) :: istatus
      integer, dimension(nprocs_sub) :: NSdata, NRdata

      real(8), dimension(2,ld1,ld2,n3) :: w
      real(8), dimension(MaxGrid,nprocs_sub) :: Dsend, Drecv
      real(8), dimension(MaxGrid) :: Vsend, Vrecv

!-----------------------------------------------------------------------
!     /*   main part                                                  */
!-----------------------------------------------------------------------

      Nas = nprocs_sub - myrank_sub

!     //   Send buffer

      NSdata = 0

      do i = Nas, n1, nprocs_sub

         do k = 1, n3

            kk = nprocs_sub - mod(k-1,nprocs_sub) 

            if( kk == (myrank_sub+1) ) cycle

            do j = 1, n2

               jj = NSdata(kk)
               Dsend( jj+1, kk ) = w(1,i,j,k)
               Dsend( jj+2, kk ) = w(2,i,j,k)
               NSdata(kk) = NSdata(kk) + 2

            end do

         end do

      end do

!     //   Recieve Count

      NRdata = 0

      do k = Nas, n3, nprocs_sub

         do i = 1, n1

            ii = nprocs_sub - mod(i-1,nprocs_sub)

            if ( ii == (myrank_sub+1) ) cycle

            NRdata(ii) = NRdata(ii) + n2 * 2

         end do

      end do

!     //

      do i = 1, nprocs_sub

         ii = i - 1

         if ( NSdata(i) /= 0 ) then

            do j = 1, NSdata(i)
               Vsend(j) = Dsend(j,i)
            end do

            call MPI_ISEND( Vsend, NSdata(i), MPI_DOUBLE_PRECISION, ii, &
     &                      i, mpi_comm_sub, ireq, ierror )

         end if

         if ( NRdata(i) /= 0 ) then

            call MPI_RECV( Vrecv, NRdata(i), MPI_DOUBLE_PRECISION, ii, &
     &                     MPI_ANY_TAG, mpi_comm_sub, istatus, ierror )

            do j = 1, NRdata(i)
               Drecv(j,i) = Vrecv(j)
            end do

          end if

          if ( NSdata(i) /= 0 ) call MPI_WAIT( ireq, istatus, ierror )

      end do

!     //

      NRdata = 0

      do i = 1, n1

         ii = nprocs_sub - mod(i-1, nprocs_sub)

         if ( ii == (myrank_sub + 1) ) cycle

         do k = Nas, n3, nprocs_sub

            do j = 1, n2

               jj = NRdata(ii)

               w(1,i,j,k) = Drecv( jj+1, ii )
               w(2,i,j,k) = Drecv( jj+2, ii )

               NRdata(ii) = NRdata(ii) + 2

            end do

         end do

      end do

      return
      end

#endif

#ifdef nopme

!***********************************************************************
      subroutine force_pmeewald_setup_MPI
!***********************************************************************

      use common_variables, only : myrank

      implicit none

      if ( myrank .eq. 0 ) then

         write( 6, '(a)' ) 'Error termination - PME is not compiled.'
         write( 6, '(a)' )
         write( 6, '(a)' ) 'Recompile pimd.mpi.x with -Dpme option.'
         write( 6, '(a)' )

      end if

      call my_mpi_finalize_2

      stop
      end





!***********************************************************************
      subroutine force_pmeewald_fs_MPI
!***********************************************************************

      use common_variables, only : myrank

      implicit none

      if ( myrank .eq. 0 ) then

         write( 6, '(a)' ) 'Error termination - PME is not compiled.'
         write( 6, '(a)' )
         write( 6, '(a)' ) 'Recompile pimd.mpi.x with -Dpme option.'
         write( 6, '(a)' )

      end if

      call my_mpi_finalize_2

      stop
      end

#endif
