!///////////////////////////////////////////////////////////////////////
!
!      Author:          M. Shiga, H. Kimizuka
!      Last updated:    Jan 7, 2019 by M. Shiga
!      Description:     three-stage parallelization
!
!///////////////////////////////////////////////////////////////////////
!=======================================================================
!
!     The following subroutines were added to support three-stages
!     MPI parallelization
!
!=======================================================================
!***********************************************************************
      subroutine my_mpi_init_3
!***********************************************************************

      use common_variables, only : &
     &   nprocs_world, myrank_world, mpi_group_world, myrank, nprocs, &
     &   nprocs_pimd, myrank_pimd

!      use common_variables, only : myrank_main, myrank_sub
!      use polymers_variables, only : myrank_top

      implicit none

      integer :: ierr

      include 'mpif.h'

      call MPI_INIT ( ierr )

      call MPI_COMM_SIZE ( MPI_COMM_WORLD, nprocs_world, ierr )
      call MPI_COMM_RANK ( MPI_COMM_WORLD, myrank_world, ierr )

!     /*   get nprocs_main and nprocs_sub   */
      call get_nprocs_3

!     /*   get group number of MPI_COMM_WORLD   */
      call MPI_COMM_GROUP ( MPI_COMM_WORLD, mpi_group_world, ierr )

!     /*   set sub groups   */
      call setup_mpi_sub_group_3

!     /*   set main groups   */
      call setup_mpi_main_group_3

!     /*   set pimd groups   */
      call setup_mpi_pimd_group_3

!     /*   set top groups   */
      call setup_mpi_top_group_3

!     /*   mpi barrier   */
      call my_mpi_barrier

!-----------------------------------------------------------------------
!     /*    number of processors in main and sub groups               */
!-----------------------------------------------------------------------

      nprocs = nprocs_pimd
      myrank = myrank_pimd

!      write( 6, '(5i3)' )
!     &   myrank_world, myrank_top, myrank_main, myrank_sub, myrank_pimd

      return
      end





!-----------------------------------------------------------------------
!     /*   example with twelve cores                                  */
!-----------------------------------------------------------------------
!
!     nprocs_world      = 12
!     npoly             = 3
!     nbead             = 2
!     nprocs_sub        = 2   = np_force
!     nprocs_main       = 2   = np_beads
!     nprocs_top        = 3   = np_poly
!     nprocs_pimd       = 4   = np_beads * np_force
!
!-----------------------------------------------------------------------
!     myrank_world      = 0  1  2  3  4  5  6  7  8  9 10 11
!-----------------------------------------------------------------------
!     ranklist_sub(1)   = 0  0  2  2  4  4  6  6  8  8 10 10
!     ranklist_sub(2)   = 1  1  3  3  5  5  7  7  9  9 11 11
!     myrank_sub        = 0  1  0  1  0  1  0  1  0  1  0  1
!-----------------------------------------------------------------------
!
!-----------------------------------------------------------------------
!     myrank_world      = 0  1  2  3  4  5  6  7  8  9 10 11
!-----------------------------------------------------------------------
!     ranklist_main(1)  = 0  1  0  1  4  5  4  5  8  9  8  9
!     ranklist_main(2)  = 2  3  2  3  6  7  6  7 10 11 10 11
!     myrank_main       = 0  0  1  1  0  0  1  1  0  0  1  1
!-----------------------------------------------------------------------
!
!-----------------------------------------------------------------------
!     myrank_world      = 0  1  2  3  4  5  6  7  8  9 10 11
!-----------------------------------------------------------------------
!     ranklist_top(1)   = 0  1  2  3  0  1  2  3  0  1  2  3
!     ranklist_top(2)   = 4  5  6  7  4  5  6  7  4  5  6  7
!     ranklist_top(3)   = 8  9 10 11  8  9 10 11  8  9 10 11
!     myrank_top        = 0  0  0  0  1  1  1  1  2  2  2  2
!-----------------------------------------------------------------------
!
!-----------------------------------------------------------------------
!     myrank_world      = 0  1  2  3  4  5  6  7  8  9 10 11
!-----------------------------------------------------------------------
!     ranklist_pimd(1)  = 0  0  0  0  4  4  4  4  8  8  8  8
!     ranklist_pimd(2)  = 1  1  1  1  5  5  5  5  9  9  9  9
!     ranklist_pimd(3)  = 2  2  2  2  6  6  6  6 10 10 10 10
!     ranklist_pimd(3)  = 3  3  3  3  7  7  7  7 11 11 11 11
!     myrank_pimd       = 0  1  2  3  0  1  2  3  0  1  2  3
!     myrank            = 0  1  2  3  0  1  2  3  0  1  2  3
!-----------------------------------------------------------------------

!***********************************************************************
      subroutine get_nprocs_3
!***********************************************************************
!=======================================================================
!
!     This subroutine returns np_beads and np_force
!
!=======================================================================

      use common_variables, only : &
     &   nbead, iounit, np_beads, np_force, np_cycle, nprocs_pimd, &
     &   myrank_world, nprocs_world, nprocs_main, nprocs_sub

      use polymers_variables, only : &
     &   npoly, np_sweep, np_poly, np_pimd, nprocs_top

      implicit none

      integer :: ierr

!-----------------------------------------------------------------------
!     /*    read number of polymers                                   */
!-----------------------------------------------------------------------

      call read_int1_MPI( npoly, '<npoly>', 7, iounit )

!-----------------------------------------------------------------------
!     /*    read number of beads                                      */
!-----------------------------------------------------------------------

      call read_int1_MPI( nbead, '<nbead>', 7, iounit )

!-----------------------------------------------------------------------
!     /*    read number of polymers                                   */
!-----------------------------------------------------------------------

      call read_int1_MPI( np_poly, '<np_poly>', 9, iounit )

      ierr = 0

      if ( np_poly .gt. npoly ) then

         if ( myrank_world .eq. 0 ) then
            write ( 6, '(a,2i6)' ) &
     &     'Error - np_poly should be less than npoly', &
     &        np_poly, npoly
            write ( 6, '(a)' )
         end if

         ierr = 1

      end if

      call error_handling_MPI &
     &    ( ierr, 'subroutine communicate_3_MPI', 28 )

!-----------------------------------------------------------------------
!     /*    pimd parallelization parameter                            */
!-----------------------------------------------------------------------

      np_pimd  = nprocs_world / np_poly

      if ( mod(nprocs_world,np_poly) .ne. 0 ) then

         ierr = 1

         if ( myrank_world .eq. 0 ) then
            write ( 6, '(a)' ) &
     &        'Error - np_poly should be a divisor of parallel number.'
            write ( 6, '(a)' )
         end if

      end if

      call error_handling_MPI &
     &    ( ierr, 'subroutine communicate_3_MPI', 28 )

!-----------------------------------------------------------------------
!     /*    polymer sweeps per step                                   */
!-----------------------------------------------------------------------

      if ( mod(npoly,np_poly) .eq. 0 ) then
         np_sweep = npoly / np_poly
      else
         np_sweep = npoly / np_poly + 1
      end if

!-----------------------------------------------------------------------
!     /*    bead parallelization parameter                            */
!-----------------------------------------------------------------------

      np_beads = min(nbead,np_pimd)

!-----------------------------------------------------------------------
!     /*    force parallelization parameter                           */
!-----------------------------------------------------------------------

      np_force = nprocs_world / np_beads / np_poly

!-----------------------------------------------------------------------
!     /*    force cycles per step                                     */
!-----------------------------------------------------------------------

      if ( mod(nbead,np_beads) .eq. 0 ) then
         np_cycle = nbead / np_beads
      else
         np_cycle = nbead / np_beads + 1
      end if

!-----------------------------------------------------------------------
!     /*    print                                                     */
!-----------------------------------------------------------------------

      if( myrank_world .eq. 0 ) then

         write ( 6, '(a)' ) 

         write ( 6, '(a)' )  &
     &      '_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/' // &
     &      '_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/'

         write ( 6, '(a)' )

         write( 6, '(a)' )    'Information of parallel computation:'
         write( 6, '(a)' )
         write( 6, '(a,i6)' ) &
     &      '  Number of processors     = ', nprocs_world
         write( 6, '(a,i6)' ) &
     &      '  Number of polymers       = ', npoly
         write( 6, '(a,i6)' ) &
     &      '  Number of beads          = ', nbead
         write( 6, '(a,i6)' ) &
     &      '  Polymer parallelization  = ', np_poly
         write( 6, '(a,i6)' ) &
     &      '  Bead parallelization     = ', np_beads
         write( 6, '(a,i6)' ) &
     &      '  Force parallelization    = ', np_force
         write( 6, '(a,i6)' ) &
     &      '  Polymer cycles per step  = ', np_sweep
         write( 6, '(a,i6)' ) &
     &      '  Force cycles per step    = ', np_cycle
         write( 6, '(a)' )

         if ( mod(npoly,np_poly) .ne. 0 ) then

            write ( 6, '(a,i6,a,i6)' ) &
     &         'Warning - npoly / np_poly is not an integer:', &
     &         npoly, ' / ', np_poly

            write ( 6, '(a)' )

         end if

         if ( mod(nbead,np_beads) .ne. 0 ) then

            write ( 6, '(a,i6,a,i6)' ) &
     &         'Warning - nbead / np_beads is not an integer:', &
     &         nbead, ' / ', np_beads

            write ( 6, '(a)' )

         end if

      end if

!-----------------------------------------------------------------------
!     /*    check                                                     */
!-----------------------------------------------------------------------

      if( (np_poly*np_beads*np_force) .eq. nprocs_world ) then

         ierr = 0

      else

         if( myrank_world .eq. 0 ) then
            write ( 6, '(a,i6)' ) &
     &     'Error - nprocs_world should be np_poly*p_beads*np_force = ', &
     &        np_poly*np_beads*np_force
            write ( 6, '(a)' )
         end if

         ierr = 1

      end if

!-----------------------------------------------------------------------
!     /*    error termination                                         */
!-----------------------------------------------------------------------

      call error_handling_MPI &
     &    ( ierr, 'subroutine communicate_3_MPI', 28 )

!-----------------------------------------------------------------------
!     /*    print                                                     */
!-----------------------------------------------------------------------

      if( myrank_world .eq. 0 ) then

          write ( 6, '(a)')  &
     &       '_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/' // &
     &       '_/_/_/_/_/_/_/_/_/_/_/_/_/_/_/'

      end if

!-----------------------------------------------------------------------
!     /*    number of processors in main and sub groups               */
!-----------------------------------------------------------------------

      nprocs_top  = np_poly
      nprocs_main = np_beads
      nprocs_sub  = np_force
      nprocs_pimd = np_beads * np_force

      return
      end





!***********************************************************************
      subroutine setup_mpi_pimd_group_3
!***********************************************************************

      use common_variables, only : &
     &   mpi_group_world, myrank_world, &
     &   nprocs_pimd, mpi_comm_pimd, mpi_group_pimd, myrank_pimd

!      use common_variables, only : nprocs_world

      implicit none

      integer, allocatable :: ranklist_pimd(:)

      integer :: ierr, i
!      integer :: j, k

      include 'mpif.h'

      allocate( ranklist_pimd(nprocs_pimd) )

      do i = 1, nprocs_pimd
         ranklist_pimd(i) &
     &      = int(myrank_world/nprocs_pimd)*nprocs_pimd  + i-1
      end do

      call MPI_GROUP_INCL &
     &   ( mpi_group_world, nprocs_pimd, ranklist_pimd, &
     &                      mpi_group_pimd, ierr )

      call MPI_COMM_CREATE ( MPI_COMM_WORLD, mpi_group_pimd, &
     &                       mpi_comm_pimd, ierr )

      call MPI_COMM_RANK ( mpi_comm_pimd, myrank_pimd, ierr )

!     //   check
!      do j = 1, nprocs_world
!         if ( j-1 .eq. myrank_world ) then
!            write( 6, '(13i3)' )
!     &         j, ( ranklist_pimd(k), k = 1, nprocs_pimd )
!         end if
!         call MPI_BARRIER( MPI_COMM_WORLD, ierr )
!      end do
!      do j = 1, nprocs_world
!         if ( j-1 .eq. myrank_world ) then
!            write( 6, '(2i3)' ) myrank_world, myrank_pimd
!         end if
!         call MPI_BARRIER( MPI_COMM_WORLD, ierr )
!      end do

      deallocate( ranklist_pimd )

      return
      end





!***********************************************************************
      subroutine setup_mpi_main_group_3
!***********************************************************************

      use common_variables, only : &
     &   nprocs_main, nprocs_sub, myrank_main, myrank_world, &
     &   mpi_group_main, mpi_group_world, mpi_comm_main, nprocs_pimd

!      use common_variables, only : nprocs_world

      implicit none

      integer, allocatable :: ranklist_main(:)

      integer :: ierr, i
!      integer :: j, k

      include 'mpif.h'

      allocate( ranklist_main(nprocs_main) )

      do i = 1, nprocs_main
         ranklist_main(i) = nprocs_sub*(i-1) &
     &                    + mod(myrank_world,nprocs_sub) &
     &                    + int(myrank_world/nprocs_pimd)*nprocs_pimd
      end do

      call MPI_GROUP_INCL ( mpi_group_world, nprocs_main, ranklist_main, &
     &                      mpi_group_main, ierr )

      call MPI_COMM_CREATE ( MPI_COMM_WORLD, mpi_group_main, &
     &                       mpi_comm_main, ierr )

      call MPI_COMM_RANK ( mpi_comm_main, myrank_main, ierr )

!     //   check
!      do j = 1, nprocs_world
!         if ( j-1 .eq. myrank_world ) then
!            write( 6, '(13i3)' )
!     &         j, ( ranklist_main(k), k = 1, nprocs_main )
!         end if
!         call MPI_BARRIER( MPI_COMM_WORLD, ierr )
!      end do

      deallocate( ranklist_main )

      return
      end





!***********************************************************************
      subroutine setup_mpi_sub_group_3
!***********************************************************************

      use common_variables, only : &
     &   nprocs_sub, mpi_group_sub, mpi_group_world, myrank_world, &
     &   mpi_comm_sub, myrank_sub

!      use common_variables, only : nprocs_world

      implicit none

      integer, allocatable :: ranklist_sub(:)

      integer :: ierr, i
!      integer :: j, k

      include 'mpif.h'

      allocate( ranklist_sub(nprocs_sub) )

      do i = 1, nprocs_sub
         ranklist_sub(i) &
     &      = int(myrank_world/nprocs_sub)*nprocs_sub  + i-1
      end do

      call MPI_GROUP_INCL ( mpi_group_world, nprocs_sub, ranklist_sub, &
     &                      mpi_group_sub, ierr )

      call MPI_COMM_CREATE ( MPI_COMM_WORLD, mpi_group_sub, &
     &                       mpi_comm_sub, ierr )

      call MPI_COMM_RANK ( mpi_comm_sub, myrank_sub, ierr )

!     //   check
!      do j = 1, nprocs_world
!         if ( j-1 .eq. myrank_world ) then
!            write( 6, '(13i3)' )
!     &         j, ( ranklist_sub(k), k = 1, nprocs_sub )
!         end if
!         call MPI_BARRIER( MPI_COMM_WORLD, ierr )
!      end do

      deallocate( ranklist_sub )

      return
      end





!***********************************************************************
      subroutine setup_mpi_top_group_3
!***********************************************************************

      use common_variables, only : &
     &   mpi_group_world, myrank_world, nprocs_pimd

      use polymers_variables, only : &
     &   nprocs_top, mpi_comm_top, mpi_group_top, myrank_top

!      use common_variables, only : nprocs_world

      implicit none

      integer, allocatable :: ranklist_top(:)

      integer :: ierr, i
!      integer :: j, k

      include 'mpif.h'

      allocate( ranklist_top(nprocs_top) )

      do i = 1, nprocs_top
         ranklist_top(i) = nprocs_pimd*(i-1) &
     &                   + mod(myrank_world,nprocs_pimd)
      end do

      call MPI_GROUP_INCL &
     &   ( mpi_group_world, nprocs_top, ranklist_top, &
     &                      mpi_group_top, ierr )

      call MPI_COMM_CREATE ( MPI_COMM_WORLD, mpi_group_top, &
     &                       mpi_comm_top, ierr )

      call MPI_COMM_RANK ( mpi_comm_top, myrank_top, ierr )

!     //   check
!      do j = 1, nprocs_world
!         if ( j-1 .eq. myrank_world ) then
!            write( 6, '(13i3)' )
!     &         j, ( ranklist_top(k), k = 1, nprocs_top )
!         end if
!         call MPI_BARRIER( MPI_COMM_WORLD, ierr )
!      end do

      deallocate( ranklist_top )

      return
      end





!***********************************************************************
      subroutine my_mpi_allreduce_real_1_top ( a, n )
!***********************************************************************

      use polymers_variables, only : mpi_comm_top

      implicit none

      integer :: i, n, ierr
      real(8) :: a(n), b1(n), b2(n)

      include 'mpif.h'

      do i = 1, n
         b1(i) = a(i)
      end do

      call MPI_ALLREDUCE ( b1, b2, n, MPI_DOUBLE_PRECISION, &
     &                     MPI_SUM, mpi_comm_top, ierr )

      do i = 1, n
         a(i) = b2(i)
      end do

      return
      end





!***********************************************************************
      subroutine my_mpi_allreduce_real_2_top ( a, n1, n2 )
!***********************************************************************

      use polymers_variables, only : mpi_comm_top

      implicit none

      integer :: i, j, k, n, n1, n2, ierr
      real(8) :: a(n1,n2), b1(n1*n2), b2(n1*n2)

      include 'mpif.h'

      k = 0
      do j = 1, n2
      do i = 1, n1
         k = k + 1
         b1(k) = a(i,j)
      end do
      end do

      n = n1*n2

      call MPI_ALLREDUCE ( b1, b2, n, MPI_DOUBLE_PRECISION, &
     &                     MPI_SUM, mpi_comm_top, ierr )

      k = 0
      do j = 1, n2
      do i = 1, n1
         k = k + 1
         a(i,j) = b2(k)
      end do
      end do

      return
      end
