!-------------------------------------------------------------------------
!         NASA/GSFC, Data Assimilation Office, Code 910.3, GEOS/DAS
!-------------------------------------------------------------------------

      MODULE decompmodule 34
!BOP
!
! !MODULE: decompmodule
!
! !USES:
#if defined( STAND_ALONE )
# define iulog 6
#else
      use cam_logfile, only: iulog
#endif
#include "debug.h"

      IMPLICIT NONE

!
! !DESCRIPTION:
!
!      This module provides the DecompType and its create and destroy 
!      routines.
!      \begin{center}
!      \begin{tabular}{|l|l|} \hline \hline
!         DecompType        & Type to describe a decomposition \\ \hline
!         DecompDefined     & True iff given decomposition is defined\\ \hline
!         DecompFree        & Destroy a decomposition \\ \hline
!         DecompCopy        & Copy decomposition to newly created one\\ \hline 
!         DecompPermute     & Permute decomposition \\ \hline 
!         DecompRegular1D   & Create a 1-D decomposition \\ \hline 
!         DecompRegular2D   & Create a 2-D decomposition \\ \hline 
!         DecompRegular3D   & Create a 3-D decomposition \\ \hline 
!         DecompRegular4D   & Create a 4-D decomposition \\ \hline 
!         DecompCreateIrr   & Create an irregular 1-D decomposition \\ \hline
!         DecompCreateTags  & Create a decomposition from Pe and Tags \\ \hline
!         DecompGlobalToLocal& Map a global index to a local one \\ \hline
!         DecompLocalToGlobal& Map a local index to a global one \\
!         \hline  \hline
!      \end{tabular}
!      \end{center}
!
!      The decomposition type contains the sizes of the global array,
!      the number of entries on each PE, and for each PE a list
!      of "runs", i.e., the starting and finishing global indices
!      or "tags" whose inclusive array section resides on that PE.
!      Clearly this method of decomposition is only efficient if
!      there are long runs, i.e., long array sections which are 
!      mapped to one PE.  A random decomposition will cause poor
!      results.
!
!      The decomposition is thus very efficient for 1-D, 2-D or 3-D
!      block distributions (particularly for 1-D distributions, where
!      there is one "run" per processor).  Problems may occur for
!      an irregular decomposition (which is by definition 1-D).  If
!      there is little correspondence between the global indices of the 
!      entries and the actual decomposition (e.g., the tags are
!      assigned randomly), then there will be many runs, most
!      containing only one tag, and the resulting instance of
!      DecompType will be very large.  Fortunately, most applications
!      assign tags to entries in some sort of contiguous fashion, 
!      which is then quite appropriate for this data structure.
!
!      All numbering of multi-dimensional arrays is ROW-MAJOR, that
!      is, first in the X direction and then in the Y (and then,
!      if appropriate, in Z).  This is true for both the 2-D and
!      3-D data sets as also the Cartesian description of the PEs.
!
!      There is one glaring feature of DecompType.  It is
!      supposed to be a `one-size-fits-all' description of the
!      decomposition (with the exception of the random indexing
!      mentioned above).  Unfortunately, to describe 2-D and 3-D
!      regions, it is necessary to carry additional dimension
!      information in order have complete information for the 
!      mapping.  This means that 2-D and 3-D decompositions 
!      inherently carry more information than a 1-D decomposition.
!      Thus it {\it is} possible to use a decomposition created
!      with the Regular2D or Regular3D routines to describe the
!      corresponding decomposition when the 2-D or 3-D array is
!      viewed as a 1-D array, but it is clearly {\it not}
!      possible to use a decomposition created with Regular1D
!      to describe the decomposition of a 2-D or 3-D array
!      --- the appropriate information just is not there.
!
! !REVISION HISTORY:
!   97.07.22   Sawyer     Creation
!   97.09.01   Sawyer     Release date
!   97.11.06   Sawyer     Addition of row and column communicators
!   97.01.24   Sawyer     Added support for non-MPI derived types solution
!   97.01.29   Sawyer     Minor revisions for production service
!   98.01.30   Sawyer     Added DecompCopy
!   98.02.04   Sawyer     Removed Comm, CommRow and CommCol from DecompType
!   98.03.13   Sawyer     Removed DecompTypeOld, brushed up for walkthrough
!   98.03.19   Sawyer     Minor corrections after walkthrough
!   98.05.02   Sawyer     Added DecompPermute
!   98.05.11   Sawyer     Removed Permutation from all but DecompPermute
!   99.01.19   Sawyer     Minor cleaning
!   00.07.07   Sawyer     Removed DimSizes; decomp is now 1D only
!   00.11.12   Sawyer     Added DecompCreateTags and DecompInfo
!   01.02.03   Sawyer     Updated for free format; corrected DecompCreateTags
!   01.03.20   Sawyer     Added DecompRegular3DOrder
!   02.12.04   Sawyer     Added DecompDefined, optimized DecompGlobalToLocal
!   02.12.06   Sawyer     Bug in new DecompGlobalToLocal (remove out of bounds check)
!   02.12.08   Sawyer     Another bug: calculate the Offsets field correctly
!   02.12.23   Sawyer     Added DecompRegular4D
!
! !PUBLIC TYPES:
      PUBLIC DecompType, DecompCreate, DecompFree
      PUBLIC DecompCopy, DecompPermute, DecompDefined
      PUBLIC DecompGlobalToLocal, DecompLocalToGlobal, DecompInfo


      INTERFACE     DecompCreate 28
        MODULE PROCEDURE DecompRegular1D
        MODULE PROCEDURE DecompRegular2D
        MODULE PROCEDURE DecompRegular3D
        MODULE PROCEDURE DecompRegular3DOrder
        MODULE PROCEDURE DecompRegular4D
        MODULE PROCEDURE DecompCreateIrr
        MODULE PROCEDURE DecompCreateTags
      END INTERFACE


      INTERFACE     DecompGlobalToLocal 17
        MODULE PROCEDURE DecompG2L
        MODULE PROCEDURE DecompG2LVector
      END INTERFACE
 

      INTERFACE     DecompLocalToGlobal
        MODULE PROCEDURE DecompL2G
        MODULE PROCEDURE DecompL2GVector
      END INTERFACE
 
! Decomposition info

       TYPE Lists
         INTEGER, POINTER     :: StartTags(:) ! Start of tag run
         INTEGER, POINTER     :: EndTags(:)   ! End of tag run
         INTEGER, POINTER     :: Offsets(:)   ! Local offsets for efficiency
       END TYPE Lists

       TYPE DecompType
         LOGICAL              :: Defined      ! Is it defined?
         INTEGER              :: GlobalSize   ! Size in each dimension
         INTEGER, POINTER     :: NumEntries(:)! Number of entries per PE
         TYPE(Lists), POINTER :: Head(:)      ! Array of pointers 
       END TYPE DecompType

!EOP
      CONTAINS

!-----------------------------------------------------------------------
!BOP
! !IROUTINE: DecompDefined --- Is the decomp type defined?
!
! !INTERFACE:

      LOGICAL FUNCTION DecompDefined ( Decomp )
! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      TYPE(DecompType), INTENT( IN ):: Decomp  ! Decomp information

!
! !DESCRIPTION:
!     Returns true if Decomp has been created but not yet destroyed
!
! !REVISION HISTORY:
!   02.12.04   Sawyer     Creation from GhostDefined
!
!EOP
!-----------------------------------------------------------------------
!BOC
!
!
      CPP_ENTER_PROCEDURE( "DECOMPDEFINED" )
      DecompDefined = Decomp%Defined
      CPP_LEAVE_PROCEDURE( "DECOMPDEFINED" )

      RETURN
!EOC
      END FUNCTION DecompDefined
!-----------------------------------------------------------------------


!---------------------------------------------------------------------
!BOP
! !IROUTINE: DecompFree --- Free a decomposition
!
! !INTERFACE:

      SUBROUTINE DecompFree ( Decomp ) 16
! !USES:
      IMPLICIT NONE

! !INPUT/OUTPUT PARAMETERS:
      TYPE(DecompType), INTENT( INOUT ):: Decomp  ! Decomp information
!
! !DESCRIPTION:
!     Free the decomposition -- deallocate the data structures.
!
! !SYSTEM ROUTINES:
!     ASSOCIATED, DEALLOCATE
!
! !REVISION HISTORY:
!   98.01.30   Sawyer     Creation
!
!EOP
!------------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER  :: I, NPEs
!
      CPP_ENTER_PROCEDURE( "DECOMPFREE" )

      IF ( ASSOCIATED( Decomp%NumEntries ) )                             &
                     DEALLOCATE( Decomp%NumEntries )
      IF ( ASSOCIATED( Decomp%Head ) ) THEN
        NPEs = SIZE( Decomp%Head )
        DO I = 1, NPEs
!
! Copy the number of entries on each PE
!
          IF ( ASSOCIATED( Decomp%Head(I)%StartTags ) )                  &
                     DEALLOCATE( Decomp%Head(I)%StartTags )
          IF ( ASSOCIATED( Decomp%Head(I)%EndTags ) )                    &
                     DEALLOCATE( Decomp%Head(I)%EndTags )
          IF ( ASSOCIATED( Decomp%Head(I)%Offsets ) )                    &
                     DEALLOCATE( Decomp%Head(I)%Offsets )
        ENDDO
        DEALLOCATE( Decomp%Head )
      ENDIF
      Decomp%Defined = .FALSE.

      CPP_LEAVE_PROCEDURE( "DECOMPFREE" )
      RETURN
!EOC
      END SUBROUTINE DecompFree
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompCopy --- Copy one decomposition to another
!
! !INTERFACE:

      SUBROUTINE DecompCopy ( DecompIn, DecompOut ) 8
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      TYPE(DecompType), INTENT( IN )   :: DecompIn  ! Decomp information
!
! !OUTPUT PARAMETERS:
      TYPE(DecompType), INTENT( OUT )  :: DecompOut ! Decomp information
!
! !DESCRIPTION:
!
!   Creates an output decomposition and copies the DecompIn input values 
!
! !SYSTEM ROUTINES:
!     ALLOCATE
!
! !REVISION HISTORY:
!   98.01.30   Sawyer     Creation
!
!EOP
!------------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER  :: I, J, NDims, NPEs, NRuns
!
      CPP_ENTER_PROCEDURE( "DECOMPCOPY" )
!
! Copy the size of the global array
!
      DecompOut%GlobalSize = DecompIn%GlobalSize

!
! Allocate the number of entries and list head arrays
!
      NPEs = SIZE( DecompIn%NumEntries )
      CPP_ASSERT_F90( SIZE( DecompIn%Head ) .EQ. NPEs )
      ALLOCATE( DecompOut%NumEntries( NPEs ) )
      ALLOCATE( DecompOut%Head( NPEs ) )

      DO I = 1, NPEs
!
! Copy the number of entries on each PE
!
        DecompOut%NumEntries( I ) = DecompIn%NumEntries( I )
        NRuns = SIZE( DecompIn%Head( I )%StartTags )
        CPP_ASSERT_F90( SIZE( DecompIn%Head( I )%EndTags ) .EQ. NRuns )
!
! Allocate and copy the array of runs
!
        ALLOCATE( DecompOut%Head(I)%StartTags( NRuns ) )
        ALLOCATE( DecompOut%Head(I)%EndTags( NRuns ) )
        ALLOCATE( DecompOut%Head(I)%Offsets( NRuns ) )
        DO J = 1, NRuns
          DecompOut%Head(I)%StartTags(J) = DecompIn%Head(I)%StartTags(J)
          DecompOut%Head(I)%EndTags(J) = DecompIn%Head(I)%EndTags(J)
          DecompOut%Head(I)%Offsets(J) = DecompIn%Head(I)%Offsets(J)
        ENDDO
      ENDDO
      DecompOut%Defined = .TRUE.

      CPP_LEAVE_PROCEDURE( "DECOMPCOPY" )
      RETURN
!EOC
      END SUBROUTINE DecompCopy
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompPermute --- Permute one decomposition to another
!
! !INTERFACE:

      SUBROUTINE DecompPermute ( Permutation, Decomp )
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      INTEGER :: Permutation( : )                  ! Permutation

! !INPUT/OUTPUT PARAMETERS:
      TYPE(DecompType), INTENT( INOUT ) :: Decomp  ! Decomp information
!
!
! !DESCRIPTION:
!
!   Permutes the PE assignment of a given decomposition. Confusion will
!   always arise about whether this is a forward or backward
!   transformation.  Picture it this way: draw the array and slice it
!   up as indicated by the distribution.  The resulting boxes are of
!   course indexed by natural numbering 1, 2, 3, 4, ...  (these are
!   the virtual one-based PEs). Now write the true PE numbering
!   (one-based) as you would like it.  The resulting array is Perm.
!
!
! !SYSTEM ROUTINES:
!     ALLOCATE
!
! !REVISION HISTORY:
!   98.05.02   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER, POINTER     :: NumEntries(:)! Number of entries
      TYPE(Lists), POINTER :: Head(:)      ! Array of pointers 
      INTEGER              :: I, J, NPEs, NRuns, TruePE
!
      CPP_ENTER_PROCEDURE( "DECOMPPERMUTE" )
!
! Allocate the number of entries and list head arrays
!
      NPEs = SIZE( Decomp%NumEntries )
      ALLOCATE( NumEntries( NPEs ) )
      DO I = 1, NPEs
        TruePE = Permutation( I )
        NumEntries( TruePE ) = Decomp%NumEntries( I )
      ENDDO
!
! Deallocate old NumEntries and put the new pointer in its place
!
      DEALLOCATE( Decomp%NumEntries )
      Decomp%NumEntries => NumEntries
      NULLIFY( NumEntries )

!
! Allocate and set the permuted Lists called with pointer Head
!
      ALLOCATE( Head( NPEs ) )
      DO I = 1, NPEs
        TruePE = Permutation( I )
        NRuns = SIZE( Decomp%Head(I)%StartTags )
        CPP_ASSERT_F90( SIZE( Decomp%Head(I)%EndTags ) .EQ. NRuns )
!
! Allocate and permute the array of runs
!
        ALLOCATE( Head(TruePE)%StartTags(NRuns) )
        ALLOCATE( Head(TruePE)%EndTags(NRuns) )
        ALLOCATE( Head(TruePE)%Offsets(NRuns) )
        DO J = 1, NRuns
          Head(TruePE)%StartTags(J) = Decomp%Head(I)%StartTags(J)
          Head(TruePE)%EndTags(J)   = Decomp%Head(I)%EndTags(J)
          Head(TruePE)%Offsets(J)   = Decomp%Head(I)%Offsets(J)
        ENDDO
      ENDDO
!
! Deallocate the arrays of starting and ending tags
!
      DO I = 1, NPEs
        DEALLOCATE( Decomp%Head(I)%StartTags )
        DEALLOCATE( Decomp%Head(I)%EndTags )
        DEALLOCATE( Decomp%Head(I)%Offsets )
      ENDDO
!
! Deallocate the heads to the Lists
!
      DEALLOCATE( Decomp%Head )

!
! Link the new head to that in the decomposition
!
      Decomp%Head => Head
      
      NULLIFY( Head )

      CPP_LEAVE_PROCEDURE( "DECOMPPERMUTE" )
      RETURN
!EOC
      END SUBROUTINE DecompPermute
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompRegular1D --- Create a decomposition for a 1-D grid
!
! !INTERFACE:

      SUBROUTINE DecompRegular1D ( NPEs, Dist, Decomp ) 1
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )            :: NPEs     ! Number of PEs
      INTEGER, INTENT( IN )            :: Dist(:)  ! Distribution in X
!
! !OUTPUT PARAMETERS:
      TYPE(DecompType), INTENT( OUT )  :: Decomp   ! Decomp information
!
! !DESCRIPTION:
!     Creates a variable block decomposition for a regular 1-D grid
!     (this is also known as a "block-general" distribution).  The
!     decomposition is given through the Dist distribution 
!     which contains the number of entries on each PE.  
!
! !SYSTEM ROUTINES:
!     ALLOCATE
!
! !REVISION HISTORY:
!   98.01.19   Sawyer     Creation
!   98.01.22   Sawyer     Corrections, TESTED
!   98.05.11   Sawyer     Removed Perm from arglist -- see DecompPermute
!   00.07.07   Sawyer     Removed use of DimSizes(:) array
!
!EOP
!------------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER  :: I, Counter
!
      CPP_ENTER_PROCEDURE( "DECOMPREGULAR1D" )
!
      CPP_ASSERT_F90( NPEs .EQ. SIZE( Dist ) )
!
! The head contains NPEs pointers to the tag lists.
!
      Decomp%GlobalSize = SUM(Dist)
      ALLOCATE( Decomp%NumEntries( NPEs ) )
      ALLOCATE( Decomp%Head( NPEs ) )
      Counter = 0
      DO I = 1, NPEs
        Decomp%NumEntries(I) = Dist(I)
!
! Since this is a regular distribution there is only one run of tags per PE.
!
        NULLIFY( Decomp%Head(I)%StartTags )
        NULLIFY( Decomp%Head(I)%EndTags )
        NULLIFY( Decomp%Head(I)%Offsets )
        ALLOCATE( Decomp%Head(I)%StartTags(1) )
        ALLOCATE( Decomp%Head(I)%EndTags(1) )
        ALLOCATE( Decomp%Head(I)%Offsets(1) )
!
! The starting and ending tags are immediately determined from
! the decomposition arrays  
!
        Decomp%Head(I)%StartTags(1) = Counter+1
        Counter = Counter + Dist(I)
        Decomp%Head(I)%EndTags(1) = Counter
        Decomp%Head(I)%Offsets(1) = 0   ! Offset in local segment
      ENDDO

      Decomp%Defined = .TRUE.

      CPP_LEAVE_PROCEDURE( "DECOMPREGULAR1D" )
      RETURN
!EOC
      END SUBROUTINE DecompRegular1D
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompRegular2D --- Create a decomposition for a 2-D grid
!
! !INTERFACE:

      SUBROUTINE DecompRegular2D( NPEsX, NPEsY, Xdist, Ydist, Decomp ) 1
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )            :: NPEsX    ! Number of PEs in X
      INTEGER, INTENT( IN )            :: NPEsY    ! Number of PEs in Y
      INTEGER, INTENT( IN )            :: Xdist(:) ! Distribution in X
      INTEGER, INTENT( IN )            :: Ydist(:) ! Distribution in Y
!
! !OUTPUT PARAMETERS:
      TYPE(DecompType), INTENT( OUT )  :: Decomp  ! Decomp information
!
!
! !DESCRIPTION:
!     Creates a variable block-block decomposition for a regular 
!     2-D grid.  The decomposition is given through the Xdist and 
!     Ydist distributions, which contain the number of entries on 
!     each PE in that dimension.  This routine thus defines
!     a rectangular "checkerboard" distribution.
!
! !SYSTEM ROUTINES:
!     ALLOCATE
!
! !REVISION HISTORY:
!   98.01.19   Sawyer     Creation
!   98.01.22   Sawyer     Corrections, TESTED
!   98.05.11   Sawyer     Removed Perm from arglist -- see DecompPermute
!   00.07.07   Sawyer     Removed use of DimSizes(:) array
!
! !BUGS:
!     This routine makes the assumption that the sum of the
!     distribution in each dimension adds up to the total 
!     number of entries in that dimension.  It will cause
!     problems if the actual local arrays are over- or 
!     under-allocated.  For example, if the local array is
!     allocated statically for the maximum size of the
!     array on any processor, problems will occur on those
!     PEs which have less than the maximum.
!
!EOP
!------------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER :: TruePE, I, J, K, Counter1, Counter2, SizeX, SizeY
!
      CPP_ENTER_PROCEDURE( "DECOMPREGULAR2D" )
!
! Some sanity checks
!
      CPP_ASSERT_F90( NPEsX .EQ. SIZE( Xdist ) )
      CPP_ASSERT_F90( NPEsY .EQ. SIZE( Ydist ) )
!
! The head contains NPEs pointers to the tag lists.
!
      SizeX = SUM(Xdist)
      SizeY = SUM(Ydist)
      Decomp%GlobalSize = SizeX * SizeY
      ALLOCATE( Decomp%NumEntries( NPEsX*NPEsY ) )
      ALLOCATE( Decomp%Head( NPEsX*NPEsY ) )
      Counter1 = 0
      DO J = 1, NPEsY
        DO I = 1, NPEsX
!
! WARNING!!!!  The definition of the PE is Row-major ordering
!
          TruePE = ( J-1 ) * NPEsX + I 

!
! The number of entries is the product of the local X, Y, Z allotment
!
          Decomp%NumEntries(TruePE) = Xdist(I)*Ydist(J)
!
! For each Y there is a separate run
!
          NULLIFY( Decomp%Head(TruePE)%StartTags )
          NULLIFY( Decomp%Head(TruePE)%EndTags )
          NULLIFY( Decomp%Head(TruePE)%Offsets )
          ALLOCATE( Decomp%Head(TruePE)%StartTags(Ydist(J)) )
          ALLOCATE( Decomp%Head(TruePE)%EndTags(Ydist(J)) )
          ALLOCATE( Decomp%Head(TruePE)%Offsets(Ydist(J)) )
          Counter2 = Counter1
          DO K = 1, Ydist(J)
!
! Since this is a regular distribution the definition of
! tags is dictated by Xdist(I), and appears Ydist(J) times
!
!
            Decomp%Head(TruePE)%StartTags(K) = Counter2 + 1
            Decomp%Head(TruePE)%EndTags(K) = Counter2 + Xdist(I)
            Counter2 = Counter2 + SizeX
          ENDDO
          Counter1 = Counter1 + Xdist(I)
        ENDDO
!
! Align the counter such that it points to the start of the next 
! block.  (Ydist(J)-1) since already one layer has been added in.
! Implicit assumption that SizeX = SUM( Xdist )
!
        Counter1 = Counter1 + SizeX*(Ydist(J)-1)
      ENDDO
!
! Calculate offsets
!
      DO I=1, NPEsX*NPEsY
        IF ( SIZE(Decomp%Head(I)%StartTags) > 0 ) THEN
          Decomp%Head(I)%Offsets(1) = 0
          DO J=2, SIZE(Decomp%Head(I)%StartTags)
            Decomp%Head(I)%Offsets(J) = Decomp%Head(I)%Offsets(J-1) +      &
            Decomp%Head(I)%EndTags(J-1) - Decomp%Head(I)%StartTags(J-1) + 1
          ENDDO
        ENDIF
      ENDDO

      Decomp%Defined = .TRUE.

      CPP_LEAVE_PROCEDURE( "DECOMPREGULAR2D" )
      RETURN
!EOC
      END SUBROUTINE DecompRegular2D
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompRegular3D --- Create a decomposition for a 3-D grid
!
! !INTERFACE:

      SUBROUTINE DecompRegular3D ( NPEsX, NPEsY, NPEsZ,                  & 1
                                   Xdist, Ydist, Zdist, Decomp )
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )            :: NPEsX    ! Number of PEs in X
      INTEGER, INTENT( IN )            :: NPEsY    ! Number of PEs in Y
      INTEGER, INTENT( IN )            :: NPEsZ    ! Number of PEs in Z
      INTEGER, INTENT( IN )            :: Xdist(:) ! Distribution in X
      INTEGER, INTENT( IN )            :: Ydist(:) ! Distribution in Y
      INTEGER, INTENT( IN )            :: Zdist(:) ! Distribution in Z
!
! !OUTPUT PARAMETERS:
      TYPE(DecompType), INTENT( OUT )  :: Decomp  ! Decomp information
!
!
! !DESCRIPTION:
!     Creates a decomposition for a regular 3-D grid.  The
!     decomposition is given through the Xdist, Ydist, and Zdist
!     distributions, which contain the number of entries on 
!     each PE in that dimension.    This routine thus defines
!     a parallelopiped (SOMA-block) distribution.
!
! !SYSTEM ROUTINES:
!     ALLOCATE
!
! !REVISION HISTORY:
!   98.01.19   Sawyer     Creation
!   98.05.11   Sawyer     Removed Perm from arglist -- see DecompPermute
!   00.07.07   Sawyer     Removed use of Sizes(:) array
!
! !BUGS:
!     This routine makes the assumption that the sum of the
!     distribution in each dimension adds up to the total 
!     number of entries in that dimension.  It will cause
!     problems if the actual local arrays are over- or 
!     under-allocated.  For example, if the local array is
!     allocated statically for the maximum size of the
!     array on any processor, problems will occur on those
!     PEs which have less than the maximum.
!
!EOP
!------------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER  :: TruePE, Counter1, Counter2, Counter3
      INTEGER  :: I, J, K, L, M, N, SizeX, SizeY, SizeZ
!
      CPP_ENTER_PROCEDURE( "DECOMPREGULAR3D" )
!
! Some sanity checks
!
!
      CPP_ASSERT_F90( NPEsX .EQ. SIZE( Xdist ) )
      CPP_ASSERT_F90( NPEsY .EQ. SIZE( Ydist ) )
      CPP_ASSERT_F90( NPEsZ .EQ. SIZE( Zdist ) )
      CPP_ASSERT_F90( .NOT. ASSOCIATED( Decomp%Head ) )
!
! The head contains NPEs pointers to the tag lists.
!
      SizeX = SUM(Xdist)
      SizeY = SUM(Ydist)
      SizeZ = SUM(Zdist)
      Decomp%GlobalSize = SizeX * SizeY * SizeZ
      ALLOCATE( Decomp%NumEntries( NPEsX*NPEsY*NPEsZ ) )
      ALLOCATE( Decomp%Head( NPEsX*NPEsY*NPEsZ ) )
      Counter1 = 0
      DO K = 1, NPEsZ
        DO J = 1, NPEsY
          DO I = 1, NPEsX
!
! WARNING!!!!  The definition of the PE is Row-major ordering
!
            TruePE = (K-1)*NPEsX*NPEsY + (J-1)*NPEsX + I 
            NULLIFY( Decomp%Head(TruePE)%StartTags )
            NULLIFY( Decomp%Head(TruePE)%EndTags )
            NULLIFY( Decomp%Head(TruePE)%Offsets )
!
! The number of entries is the product of the local X, Y, Z allotment
!
            Decomp%NumEntries(TruePE) = Xdist(I)*Ydist(J)*Zdist(K)
!
! For each Z there are Y separate runs
!
            ALLOCATE( Decomp%Head(TruePE)%StartTags(Ydist(J)*Zdist(K)) )
            ALLOCATE( Decomp%Head(TruePE)%EndTags(Ydist(J)*Zdist(K)) )
            ALLOCATE( Decomp%Head(TruePE)%Offsets(Ydist(J)*Zdist(K)) )
            Counter2 = Counter1
            L = 0
            DO N = 1, Zdist(K)
              Counter3 = Counter2
              DO M = 1, Ydist(J)
!
!     Since this is a regular distribution the definition of
!     tags is dictated by Xdist(I), and appears Ydist(J) times
!
!
                L = L + 1
                Decomp%Head(TruePE)%StartTags(L) = Counter3 + 1
                Decomp%Head(TruePE)%EndTags(L) = Counter3 + Xdist(I)
                Counter3 = Counter3 + SizeX
              ENDDO
              Counter2 = Counter2 + SizeX*SizeY
            ENDDO
            Counter1 = Counter1 + Xdist(I)
          ENDDO
!
! Align the counter such that it points to the start of the next 
! block.  (Ydist(J)-1) since already one X layer has been added in.
! Implicit assumption that SizeX = SUM( Xdist )
!
          Counter1 = Counter1 + SizeX*(Ydist(J)-1)
        ENDDO
!
! Align the counter such that it points to the start of the next 
! block.  (Zdist(K)-1) since already one X-Y layer has been added in.
! Implicit assumption that SizeY = SUM( Ydist )
!
        Counter1 = Counter1 + SizeX*SizeY*(Zdist(K)-1)
      ENDDO
!
! Calculate offsets
!
      DO I=1, NPEsX*NPEsY*NPEsZ
        IF ( SIZE(Decomp%Head(I)%StartTags) > 0 ) THEN
          Decomp%Head(I)%Offsets(1) = 0
          DO J=2, SIZE(Decomp%Head(I)%StartTags)
            Decomp%Head(I)%Offsets(J) = Decomp%Head(I)%Offsets(J-1) +      &
            Decomp%Head(I)%EndTags(J-1) - Decomp%Head(I)%StartTags(J-1) + 1
          ENDDO
        ENDIF
      ENDDO

      Decomp%Defined = .TRUE.

      CPP_LEAVE_PROCEDURE( "DECOMPREGULAR3D" )
      RETURN
!EOC
      END SUBROUTINE DecompRegular3D
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompRegular3Dorder --- Create a decomposition for a 3-D grid
!
! !INTERFACE:

      SUBROUTINE DecompRegular3Dorder( Order, NPEsX, NPEsY, NPEsZ,       & 1
                                       Xdist, Ydist, Zdist, Decomp )
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      CHARACTER(3), INTENT( IN )       :: Order    ! Dim. ordering
      INTEGER, INTENT( IN )            :: NPEsX    ! Number of PEs in X
      INTEGER, INTENT( IN )            :: NPEsY    ! Number of PEs in Y
      INTEGER, INTENT( IN )            :: NPEsZ    ! Number of PEs in Z
      INTEGER, INTENT( IN )            :: Xdist(:) ! Distribution in X
      INTEGER, INTENT( IN )            :: Ydist(:) ! Distribution in Y
      INTEGER, INTENT( IN )            :: Zdist(:) ! Distribution in Z
!
! !OUTPUT PARAMETERS:
      TYPE(DecompType), INTENT( OUT )  :: Decomp  ! Decomp information
!
! !DESCRIPTION:
!     Creates a variable block-block-block decomposition for a regular 
!     3-D grid, where the ordering of the PEs can be explicitly given
!     (see next paragraph). The decomposition is given through the 
!     Xdist, Ydist, and Zdist distributions, which contain the number 
!     of entries on each PE in that dimension.  This routine thus defines
!     a parallelopiped (SOMA-block) distribution.
!
!     With the string argument Order, the order of counting in the
!     3d PE space can be specified.  There are six possible values:
!     "xyz", "xzy", "yxz", "yzx", "zxy", and "zyx".  
!
!     The same as DecompRegular3Dorder could also be achieved by
!     using DecompRegular3D and then permuting the PE ownership
!     with DecompPermute.
!
! !SYSTEM ROUTINES:
!     ALLOCATE
!
! !REVISION HISTORY:
!   01.03.20   Sawyer     Creation from DecompRegular3Dzy, added ordering
!
! !BUGS:
!   Not yet tested
!
!EOP
!---------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER  :: TruePE, Counter1, Counter2, Counter3
      INTEGER  :: I, J, K, L, M, N, SizeX, SizeY, SizeZ
      INTEGER  :: Imult, Jmult, Kmult
!
      CPP_ENTER_PROCEDURE( "DECOMPREGULAR3DORDER" )
!
! Some sanity checks
!
!
      CPP_ASSERT_F90( NPEsX .EQ. SIZE( Xdist ) )
      CPP_ASSERT_F90( NPEsY .EQ. SIZE( Ydist ) )
      CPP_ASSERT_F90( NPEsZ .EQ. SIZE( Zdist ) )
      CPP_ASSERT_F90( .NOT. ASSOCIATED( Decomp%Head ) )

      IF ( Order=="xyz" ) THEN
! Looks like:      TruePE = (K-1)*NPEsX*NPEsY + (J-1)*NPEsX + (I-1) + 1
        Imult = 1
        Jmult = NPEsX
        Kmult = NPEsX*NPEsY
      ELSE IF ( Order=="xzy" ) THEN
! Looks like:      TruePE = (J-1)*NPEsX*NPEsZ + (K-1)*NPEsX + (I-1) + 1
        Imult = 1
        Jmult = NPEsX*NPEsZ
        Kmult = NPEsX
      ELSE IF ( Order=="yxz" ) THEN
! Looks like:      TruePE = (K-1)*NPEsY*NPEsX + (I-1)*NPEsY + (J-1) + 1
        Imult = NPEsY
        Jmult = 1
        Kmult = NPEsX*NPEsY
      ELSE IF ( Order=="yzx" ) THEN
! Looks like:      TruePE = (I-1)*NPEsY*NPEsZ + (K-1)*NPEsY + (J-1) + 1
        Imult = NPEsY*NPEsZ
        Jmult = 1
        Kmult = NPEsY
      ELSE IF ( Order=="zxy" ) THEN
! Looks like:      TruePE = (J-1)*NPEsX*NPEsZ + (I-1)*NPEsZ + (K-1) + 1
        Imult = NPEsZ
        Jmult = NPEsX*NPEsZ
        Kmult = 1
      ELSE IF ( Order=="zyx" ) THEN
! Looks like:      TruePE = (I-1)*NPEsY*NPEsZ + (J-1)*NPEsZ + (K-1) + 1
        Imult = NPEsY*NPEsZ
        Jmult = NPEsZ
        Kmult = 1
      ELSE 
! Looks like:      TruePE = (K-1)*NPEsX*NPEsY + (J-1)*NPEsX + (I-1) + 1
        write(iulog,*) "Warning: DecompCreate3Dorder", Order, "not supported"
        write(iulog,*) "         Continuing with XYZ ordering"
        Imult = 1
        Jmult = NPEsX
        Kmult = NPEsX*NPEsY
      ENDIF

!
! The head contains NPEs pointers to the tag lists.
!
      SizeX = SUM(Xdist)
      SizeY = SUM(Ydist)
      SizeZ = SUM(Zdist)
      Decomp%GlobalSize = SizeX * SizeY * SizeZ
      ALLOCATE( Decomp%NumEntries( NPEsX*NPEsY*NPEsZ ) )
      ALLOCATE( Decomp%Head( NPEsX*NPEsY*NPEsZ ) )
      Counter1 = 0

      DO K = 1, NPEsZ
        DO J = 1, NPEsY
          DO I = 1, NPEsX
!
! WARNING!!!!  The definition of the PE is Row-major ordering
!
            
            TruePE = (I-1)*Imult + (J-1)*Jmult + (K-1)*Kmult + 1 
!
! The number of entries is the product of the local X, Y, Z allotment
!
            Decomp%NumEntries(TruePE) = Xdist(I)*Ydist(J)*Zdist(K)
!
! For each Z there are Y separate runs
!
            ALLOCATE( Decomp%Head(TruePE)%StartTags(Ydist(J)*Zdist(K)) )
            ALLOCATE( Decomp%Head(TruePE)%EndTags(Ydist(J)*Zdist(K)) )
            ALLOCATE( Decomp%Head(TruePE)%Offsets(Ydist(J)*Zdist(K)) )
            Counter2 = Counter1
            L = 0
            DO N = 1, Zdist(K)
              Counter3 = Counter2
              DO M = 1, Ydist(J)
!
!     Since this is a regular distribution the definition of
!     tags is dictated by Xdist(I), and appears Ydist(J) times
!
!
                L = L + 1
                Decomp%Head(TruePE)%StartTags(L) = Counter3 + 1
                Decomp%Head(TruePE)%EndTags(L) = Counter3 + Xdist(I)
                Counter3 = Counter3 + SizeX
              ENDDO
              Counter2 = Counter2 + SizeX*SizeY
            ENDDO
            Counter1 = Counter1 + Xdist(I)
          ENDDO
!
! Align the counter such that it points to the start of the next 
! block.  (Ydist(J)-1) since already one X layer has been added in.
! Implicit assumption that SizeX = SUM( Xdist )
!
          Counter1 = Counter1 + SizeX*(Ydist(J)-1)
        ENDDO
!
! Align the counter such that it points to the start of the next 
! block.  (Zdist(K)-1) since already one X-Y layer has been added in.
! Implicit assumption that SizeY = SUM( Ydist )
!
        Counter1 = Counter1 + SizeX*SizeY*(Zdist(K)-1)
      ENDDO
!
! Calculate offsets
!
      DO I=1, NPEsX*NPEsY*NPEsZ
        IF ( SIZE(Decomp%Head(I)%StartTags) > 0 ) THEN
          Decomp%Head(I)%Offsets(1) = 0
          DO J=2, SIZE(Decomp%Head(I)%StartTags)
            Decomp%Head(I)%Offsets(J) = Decomp%Head(I)%Offsets(J-1) +      &
            Decomp%Head(I)%EndTags(J-1) - Decomp%Head(I)%StartTags(J-1) + 1
          ENDDO
        ENDIF
      ENDDO

      Decomp%Defined = .TRUE.

      CPP_LEAVE_PROCEDURE( "DECOMPREGULAR3DORDER" )
      RETURN
!EOC
      END SUBROUTINE DecompRegular3DOrder
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompRegular4D --- Create a decomposition for a 3-D grid
!
! !INTERFACE:

      SUBROUTINE DecompRegular4D ( NPEsX, NPEsY, NPEsZ, NPEsT,          & 1
                                   Xdist, Ydist, Zdist, Tdist, Decomp )
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )            :: NPEsX    ! Number of PEs in X
      INTEGER, INTENT( IN )            :: NPEsY    ! Number of PEs in Y
      INTEGER, INTENT( IN )            :: NPEsZ    ! Number of PEs in Z
      INTEGER, INTENT( IN )            :: NPEsT    ! Number of PEs in T
      INTEGER, INTENT( IN )            :: Xdist(:) ! Distribution in X
      INTEGER, INTENT( IN )            :: Ydist(:) ! Distribution in Y
      INTEGER, INTENT( IN )            :: Zdist(:) ! Distribution in Z
      INTEGER, INTENT( IN )            :: Tdist(:) ! Distribution in T
!
! !OUTPUT PARAMETERS:
      TYPE(DecompType), INTENT( OUT )  :: Decomp  ! Decomp information
!
!
! !DESCRIPTION:
!     Creates a decomposition for a regular 4-D grid.  The
!     decomposition is given through the Xdist, Ydist, Zdist, Tdist
!     distributions, which contain the number of entries on 
!     each PE in that dimension.    This routine thus defines
!     a parallelopiped (SOMA-block) distribution.
!
! !SYSTEM ROUTINES:
!     ALLOCATE
!
! !REVISION HISTORY:
!   02.12.23   Sawyer     Creation from DecompRegular4D
!
! !BUGS:
!     This routine makes the assumption that the sum of the
!     distribution in each dimension adds up to the total 
!     number of entries in that dimension.  It will cause
!     problems if the actual local arrays are over- or 
!     under-allocated.  For example, if the local array is
!     allocated statically for the maximum size of the
!     array on any processor, problems will occur on those
!     PEs which have less than the maximum.
!
!EOP
!------------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER  :: TruePE, Counter1, Counter2, Counter3, Counter4
      INTEGER  :: I, J, K, L, M, N, P, T, SizeX, SizeY, SizeZ, SizeT
!
      CPP_ENTER_PROCEDURE( "DECOMPREGULAR4D" )
!
! Some sanity checks
!
!
      CPP_ASSERT_F90( NPEsX .EQ. SIZE( Xdist ) )
      CPP_ASSERT_F90( NPEsY .EQ. SIZE( Ydist ) )
      CPP_ASSERT_F90( NPEsZ .EQ. SIZE( Zdist ) )
      CPP_ASSERT_F90( NPEsT .EQ. SIZE( Tdist ) )
      CPP_ASSERT_F90( .NOT. ASSOCIATED( Decomp%Head ) )
!
! The head contains NPEs pointers to the tag lists.
!
      SizeX = SUM(Xdist)
      SizeY = SUM(Ydist)
      SizeZ = SUM(Zdist)
      SizeT = SUM(Tdist)
      Decomp%GlobalSize = SizeX * SizeY * SizeZ * SizeT
      ALLOCATE( Decomp%NumEntries( NPEsX*NPEsY*NPEsZ*NPEsT ) )
      ALLOCATE( Decomp%Head( NPEsX*NPEsY*NPEsZ*NPEsT ) )
      Counter1 = 0
      DO T = 1, NPEsT
        DO K = 1, NPEsZ
          DO J = 1, NPEsY
            DO I = 1, NPEsX
!
! WARNING!!!!  The definition of the PE is Row-major ordering
!
              TruePE = (T-1)*NPEsX*NPEsY*NPEsZ +                      &
                       (K-1)*NPEsX*NPEsY + (J-1)*NPEsX + I 
              NULLIFY( Decomp%Head(TruePE)%StartTags )
              NULLIFY( Decomp%Head(TruePE)%EndTags )
              NULLIFY( Decomp%Head(TruePE)%Offsets )
!
! The number of entries is the product of the local X, Y, Z allotment
!
              Decomp%NumEntries(TruePE) =                             &
                     Xdist(I)*Ydist(J)*Zdist(K)*Tdist(T)
!
! For each Z there are Y separate runs
!
              ALLOCATE( Decomp%Head(TruePE)%StartTags(Ydist(J)*Zdist(K)*Tdist(T)) )
              ALLOCATE( Decomp%Head(TruePE)%EndTags(Ydist(J)*Zdist(K)*Tdist(T)) )
              ALLOCATE( Decomp%Head(TruePE)%Offsets(Ydist(J)*Zdist(K)*Tdist(T)) )
              Counter2 = Counter1     ! Base address
              L = 0
              DO P = 1, Tdist(T)
                Counter3 = Counter2
                DO N = 1, Zdist(K)
                  Counter4 = Counter3
                  DO M = 1, Ydist(J)
!
!     Since this is a regular distribution the definition of
!     tags is dictated by Xdist(I), and appears Ydist(J) times
!
!
                    L = L + 1
                    Decomp%Head(TruePE)%StartTags(L) = Counter4 + 1
                    Decomp%Head(TruePE)%EndTags(L) = Counter4 + Xdist(I)
                    Counter4 = Counter4 + SizeX
                  ENDDO
                  Counter3 = Counter3 + SizeX*SizeY
                ENDDO
                Counter2 = Counter2 + SizeX*SizeY*SizeZ
              ENDDO
              Counter1 = Counter1 + Xdist(I)   ! Increment base address
            ENDDO
!
! Align the counter such that it points to the start of the next 
! block.  (Ydist(J)-1) since already one X layer has been added in.
! Implicit assumption that SizeX = SUM( Xdist )
!
            Counter1 = Counter1 + SizeX*(Ydist(J)-1)
          ENDDO
!
! Align the counter such that it points to the start of the next 
! block.  (Zdist(K)-1) since already one X-Y layer has been added in.
! Implicit assumption that SizeY = SUM( Ydist )
!
          Counter1 = Counter1 + SizeX*SizeY*(Zdist(K)-1)
        ENDDO
        Counter1 = Counter1 + SizeX*SizeY*SizeZ*(Tdist(T)-1)
      ENDDO
!
! Calculate offsets
!
      DO I=1, NPEsX*NPEsY*NPEsZ*NPEsT
        IF ( SIZE(Decomp%Head(I)%StartTags) > 0 ) THEN
          Decomp%Head(I)%Offsets(1) = 0
          DO J=2, SIZE(Decomp%Head(I)%StartTags)
            Decomp%Head(I)%Offsets(J) = Decomp%Head(I)%Offsets(J-1) +      &
            Decomp%Head(I)%EndTags(J-1) - Decomp%Head(I)%StartTags(J-1) + 1
          ENDDO
        ENDIF
      ENDDO

      Decomp%Defined = .TRUE.

      CPP_LEAVE_PROCEDURE( "DECOMPREGULAR4D" )
      RETURN
!EOC
      END SUBROUTINE DecompRegular4D
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompCreateIrr --- Decomposition for an irregular mesh
!
! !INTERFACE:

      SUBROUTINE DecompCreateIrr( NPEs, Pe, TotalPts, Decomp ) 1
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )            :: NPEs     ! Number of PEs
      INTEGER, INTENT( IN )            :: Pe(:)    ! Processor location
      INTEGER, INTENT( IN )            :: TotalPts ! Number of points
!
! !OUTPUT PARAMETERS:
      TYPE(DecompType), INTENT( OUT )  :: Decomp  ! Decomp information
!
!
! !DESCRIPTION:
!     Creates a decomposition for a irregular 1-D mesh.  The
!     decomposition is given through the number of points and
!     an array containing the PE which each point is mapped to.
!     This mapping naturally assumes that the local numbering
!     is incrementally increasing as points are mapped to PEs.
!     This assumption is sufficient for most applications, but
!     another irregular mapping routine is available for more
!     complex cases.
!
! !SYSTEM ROUTINES:
!     ALLOCATE
!
! !REVISION HISTORY:
!   98.01.19   Sawyer     Creation, with requirements from Jay Larson
!   98.11.02   Sawyer     Rewritten to requirements for Andrea Molod
!   00.07.07   Sawyer     Removed use of DimSizes(:) array
!   00.11.12   Sawyer     Changed argument order for overloading
!
!EOP
!------------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER  :: I, J, PEhold
      INTEGER  :: Counter( NPEs )
!
      CPP_ENTER_PROCEDURE( "DECOMPCREATEIRR" )
!
      CPP_ASSERT_F90( TotalPts .LE. SIZE( PE ) )
      CPP_ASSERT_F90( .NOT. ASSOCIATED( Decomp%Head ) )

!
! The head contains NPEs pointers to the tag lists.
!
      Decomp%GlobalSize = TotalPts
      ALLOCATE( Decomp%NumEntries( NPEs ) )
      ALLOCATE( Decomp%Head( NPEs ) )
!
! Perform over all points in the mapping
!
      PEhold= -1
      Counter = 0
      Decomp%NumEntries = 0
      DO I=1, TotalPts
        CPP_ASSERT_F90( ( PE( I ) .LT. NPEs .AND. PE( I ) .GE. 0 ) )
        IF ( PE( I ) .NE. PEhold ) THEN
          PEhold = PE( I )
          Counter( PEhold+1 ) = Counter( PEhold+1 ) + 1
        ENDIF
        Decomp%NumEntries(PEHold+1) = Decomp%NumEntries(PEHold+1) + 1
      ENDDO
      DO I=1, NPEs
!
! Now the amount of space to allocate is known.  It is acceptable
! to in allocated an array of size 0 (F90 Handbook, Section 6.5.1)
!
        ALLOCATE( Decomp%Head(I)%StartTags(Counter(I)) )
        ALLOCATE( Decomp%Head(I)%EndTags(Counter(I)) )
        ALLOCATE( Decomp%Head(I)%Offsets(Counter(I)) )
      ENDDO
!
! Perform over all points in the mapping
!
      PEhold= -1
      Counter = 0
      DO I=1, TotalPts
        IF ( PE( I ) .NE. PEhold ) THEN
!
! If not first entry, close up shop on previous run
!
          IF ( I .GT. 1 ) THEN
            Decomp%Head(PEhold+1)%EndTags(Counter(PEhold+1)) = I-1
          ENDIF
          PEhold = PE( I )
          Counter( PEhold+1 ) = Counter( PEhold+1 ) + 1
          Decomp%Head(PEhold+1)%StartTags(Counter(PEhold+1)) = I
        ENDIF
      ENDDO
!
! Clean up shop for the final run
!
      IF ( TotalPts .GT. 0 ) THEN
        Decomp%Head(PEhold+1)%EndTags(Counter(PEhold+1)) = TotalPts
      ENDIF

!
! Calculate offsets
!
      DO I=1, NPEs
        IF ( Counter(I) > 0 ) THEN
          Decomp%Head(I)%Offsets(1) = 0
          DO J=2, Counter(I)
            Decomp%Head(I)%Offsets(J) = Decomp%Head(I)%Offsets(J-1) +    &
            Decomp%Head(I)%EndTags(J-1) - Decomp%Head(I)%StartTags(J-1) + 1
          ENDDO
        ENDIF
      ENDDO
      Decomp%Defined = .TRUE.

      CPP_LEAVE_PROCEDURE( "DECOMPCREATEIRR" )
      RETURN
!EOC
      END SUBROUTINE DecompCreateIrr
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompCreateTags --- Decomposition from Pe and Tags
!
! !INTERFACE:

      SUBROUTINE DecompCreateTags(Npes, Pe, TotalPts, Tags, Decomp ) 1
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )            :: NPEs     ! Number of PEs
      INTEGER, INTENT( IN )            :: Pe(:)    ! Processor location
      INTEGER, INTENT( IN )            :: TotalPts ! Number of points
      INTEGER, INTENT( IN )            :: Tags(:)  ! Global index
!
! !OUTPUT PARAMETERS:
      TYPE(DecompType), INTENT( OUT )  :: Decomp   ! Decomp information
!
!
! !DESCRIPTION:
!     Creates a decomposition for a irregular mesh from the 
!     Pe ownership and the Tags.  This is a simple extension of 
!     DecompCreateIrr (previously DecompIrregular1D) but is
!     much more dangerous, since the user can define the Tags
!     (global indices) arbitrarily.
!
! !SYSTEM ROUTINES:
!     ALLOCATE
!
! !REVISION HISTORY:
!   00.11.12   Sawyer     Creation from DecompCreateIrr
!
!EOP
!------------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER  :: I, J, PEhold, LastTag
      INTEGER  :: Counter( NPEs )
!
      CPP_ENTER_PROCEDURE( "DECOMPCREATETAGS" )
!
      CPP_ASSERT_F90( TotalPts .LE. SIZE( PE ) )
      CPP_ASSERT_F90( TotalPts .LE. SIZE( Tags ) )
      CPP_ASSERT_F90( .NOT. ASSOCIATED( Decomp%Head ) )

!
! The head contains NPEs pointers to the tag lists.
!
      Decomp%GlobalSize = TotalPts
      ALLOCATE( Decomp%NumEntries( NPEs ) )
      ALLOCATE( Decomp%Head( NPEs ) )
!
! Perform over all points in the mapping
!
      PEhold  = -1
      LastTag = -999999999
      Counter = 0
      Decomp%NumEntries = 0
      DO I=1, TotalPts
        CPP_ASSERT_F90( PE( I ) .LT. NPEs .AND. PE( I ) .GE. 0 )
        IF ( LastTag==0 .OR. Tags(I)/=LastTag+1 .OR. PE(I)/=PEhold ) THEN
          PEhold = PE( I )
          Counter( PEhold+1 ) = Counter( PEhold+1 ) + 1
        ENDIF
        Decomp%NumEntries(PEHold+1) = Decomp%NumEntries(PEHold+1) + 1
        LastTag = Tags(I)
      ENDDO

      DO I=1, NPEs
!
! Now the amount of space to allocate is known.  It is acceptable
! to in allocated an array of size 0 (F90 Handbook, Section 6.5.1)
!
        ALLOCATE( Decomp%Head(I)%StartTags(Counter(I)) )
        ALLOCATE( Decomp%Head(I)%EndTags(Counter(I)) )
        ALLOCATE( Decomp%Head(I)%Offsets(Counter(I)) )
      ENDDO

!
! Perform over all points in the domain
!
      PEhold  = -1
      LastTag = -999999999
      Counter = 0
      DO I=1, TotalPts
        IF ( LastTag==0 .OR. Tags(I)/=LastTag+1 .OR. PE(I)/=PEhold ) THEN
!
! If not first entry, close up shop on previous run
!
          IF ( I .GT. 1 ) THEN
            Decomp%Head(PEhold+1)%EndTags(Counter(PEhold+1)) = LastTag
          ENDIF
          PEhold = PE( I )
          Counter( PEhold+1 ) = Counter( PEhold+1 ) + 1
          Decomp%Head(PEhold+1)%StartTags(Counter(PEhold+1)) = Tags(I)
        ENDIF
        LastTag = Tags(I)
      ENDDO
!
! Clean up shop for the final run
!
      IF ( TotalPts .GT. 0 ) THEN
        Decomp%Head(PEhold+1)%EndTags(Counter(PEhold+1)) =Tags(TotalPts)
      ENDIF

!
! Calculate offsets
!
      DO I=1, NPEs
        IF ( Counter(I) > 0 ) THEN
          Decomp%Head(I)%Offsets(1) = 0
          DO J=2, Counter(I)
            Decomp%Head(I)%Offsets(J) = Decomp%Head(I)%Offsets(J-1) +      &
            Decomp%Head(I)%EndTags(J-1) - Decomp%Head(I)%StartTags(J-1) + 1
          ENDDO
        ENDIF
      ENDDO
      Decomp%Defined = .TRUE.

      CPP_LEAVE_PROCEDURE( "DECOMPCREATETAGS" )
      RETURN
!EOC
      END SUBROUTINE DecompCreateTags
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompG2L --- Map global index to local and PE
!
! !INTERFACE:

      SUBROUTINE DecompG2L ( Decomp, Global, Local, Pe ) 1
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      TYPE(DecompType), INTENT( IN )   :: Decomp  ! Decomp information
      INTEGER, INTENT( IN )            :: Global  ! Global index
!
! !OUTPUT PARAMETERS:

      INTEGER, INTENT( OUT )  :: Local            ! Local index
      INTEGER, INTENT( OUT )  :: Pe               ! Pe location
!
!
! !DESCRIPTION:
!     Given a decomposition and a global index, this routine returns
!     the local index and PE location of that global tag.  If the
!     global index is not found, Local = 0, Pe = -1 is returned.
!
!     Note that this routine is not efficient by any stretch of the 
!     imagination --- only one index can be converted at a time.
!     In addition, a search procedure must be performed, whose 
!     efficiency is inversely proportional to the size of the decomposition
!     (in particular, to the number of "runs").  Conceptually this
!     mapping should be used only once in the program for
!     initialization, and subsequently all calculations should take
!     place using local indices.
!
! !SYSTEM ROUTINES:
!     SIZE
!
! !REVISION HISTORY:
!   98.03.20   Sawyer     Creation
!   01.03.17   Sawyer     Test for Global==0 (undefined element)
!   02.11.22   Sawyer     Optimized by caching previously used block
!EOP
!------------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER, SAVE  :: Ipe   =  0   ! Initial process ID
      INTEGER, SAVE  :: J     =  0   ! Initial DO loop value
      INTEGER        :: Ipeold, Jold, PEsize, Jsize
!
      CPP_ENTER_PROCEDURE( "DECOMPG2L" )

!
! Search over all the PEs
!
      Pe    = -1
      Local = 0
      IF ( Global == 0 ) RETURN          ! quick return
      PEsize = SIZE( Decomp%Head )
      IF ( Ipe >= PEsize ) Ipe = 0
      Ipeold= Ipe
PEs:  DO                                 ! Loop over all PEs starting 
        Jsize = SIZE( Decomp%Head(Ipe+1)%StartTags )
        IF ( J >= Jsize ) J = 0
        Jold = J                         ! from the PE used previously
Blocks: DO WHILE (Jsize > 0)             ! Loop through data segments
          IF ( Global >= Decomp%Head(Ipe+1)%StartTags(J+1) .AND.          &
               Global <= Decomp%Head(Ipe+1)%EndTags(J+1) ) THEN
            Local = Decomp%Head(Ipe+1)%Offsets(J+1) + Global -            &
                    Decomp%Head(Ipe+1)%StartTags(J+1) + 1
            Pe = Ipe
            EXIT PEs                     ! Global tag has been found
          ELSE
            J = MOD(J+1,Jsize)           ! Increment the block index
          ENDIF
          IF ( J == Jold ) EXIT Blocks   ! Global tag not on this PE
        ENDDO Blocks
        Ipe = MOD(Ipe+1,PEsize)          ! Increment the pe number
        J   = 0
        IF ( Ipe == Ipeold ) EXIT PEs    ! Global tag not found on any PE
      ENDDO PEs

      CPP_ASSERT_F90( Local .LE. Decomp%NumEntries(Pe+1) ) 

      CPP_LEAVE_PROCEDURE( "DECOMPG2L" )
      RETURN
!
!EOC
      END SUBROUTINE DecompG2L
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompG2LVector --- Map global index to local and PE
!
! !INTERFACE:

      SUBROUTINE DecompG2LVector ( Decomp, N, Global, Local, Pe ) 1
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      TYPE(DecompType), INTENT( IN ):: Decomp     ! Decomp information
      INTEGER, INTENT( IN )         :: N          ! Number of indices
      INTEGER, INTENT( IN )         :: Global(:)  ! Global index
!
! !OUTPUT PARAMETERS:

      INTEGER, INTENT( OUT )  :: Local(:)         ! Local index
      INTEGER, INTENT( OUT )  :: Pe(:)            ! Pe location
!
!
! !DESCRIPTION:
!     Given a decomposition and a global index, this routine returns
!     the local index and PE location of that global tag.  If the
!     global index is not found, Local = 0, Pe = -1 is returned.
!
!     Note that this routine is not efficient by any stretch of the 
!     imagination --- only one index can be converted at a time.
!     In addition, a search procedure must be performed, whose 
!     efficiency is inversely proportional to the size of the decomposition
!     (in particular, to the number of "runs").  Conceptually this
!     mapping should be used only once in the program for
!     initialization, and subsequently all calculations should take
!     place using local indices.
!
! !SYSTEM ROUTINES:
!     SIZE
!
! !REVISION HISTORY:
!   02.11.09   Sawyer     Creation from decompglobaltolocal
!   02.11.22   Sawyer     Optimized by caching previously used block
!
!EOP
!------------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER, SAVE :: J    = 0   ! Initial value
      INTEGER, SAVE :: Ipe  = 0   ! Initial value
      INTEGER  :: I, Ipeold, Jold, PEsize, Jsize
!
      CPP_ENTER_PROCEDURE( "DECOMPG2LVECTOR" )

      PEsize = SIZE( Decomp%Head )
!
! Search over all the PEs
!
      DO I=1, N
        Pe(I) = -1
        Local(I) = 0
        IF ( Global(I) == 0 ) CYCLE
        IF ( Ipe >= PEsize ) Ipe = 0
        Ipeold= Ipe
PEs:    DO WHILE ( PEsize > 0 )            ! Loop over all PEs starting 
          Jsize = SIZE( Decomp%Head(Ipe+1)%StartTags )
          IF ( J >= Jsize ) J = 0
          Jold = J                         ! from the PE used previously
Blocks: DO WHILE (Jsize > 0)               ! Loop through data segments
            IF ( Global(I) >= Decomp%Head(Ipe+1)%StartTags(J+1) .AND.        &
               Global(I) <= Decomp%Head(Ipe+1)%EndTags(J+1) ) THEN
              Local(I) = Decomp%Head(Ipe+1)%Offsets(J+1) + Global(I) -       &
                         Decomp%Head(Ipe+1)%StartTags(J+1) + 1
              Pe(I) = Ipe
              EXIT PEs                     ! Global tag has been found
            ELSE
              J = MOD(J+1,Jsize)           ! Increment the block index
            ENDIF
            IF ( J == Jold ) EXIT Blocks   ! Global tag not on this PE
          ENDDO Blocks
          Ipe = MOD(Ipe+1,PEsize)          ! Increment the pe number
          J   = 0
          IF ( Ipe == Ipeold ) EXIT PEs    ! Global tag not found on any PE
        ENDDO PEs

      CPP_ASSERT_F90( Local(I) .LE. Decomp%NumEntries(Pe(I)+1) ) 

      ENDDO
      CPP_LEAVE_PROCEDURE( "DECOMPG2LVECTOR" )
      RETURN
!
!EOC
    END SUBROUTINE DecompG2LVector
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompL2G --- Map global index to local and PE
!
! !INTERFACE:

      SUBROUTINE DecompL2G ( Decomp, Local, Pe, Global ) 1
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      TYPE(DecompType), INTENT( IN )   :: Decomp  ! Decomp information
      INTEGER, INTENT( IN )            :: Local   ! Local index
      INTEGER, INTENT( IN )            :: Pe      ! Pe location
!
! !OUTPUT PARAMETERS:
      INTEGER, INTENT( OUT )           :: Global  ! Global index
!
!
! !DESCRIPTION:
!     Given a decomposition and a local-pe index pair, this routine 
!     returns  the 2-D global index. If the local index is not found, 
!     0 is returned. 
!
!     Note that this routine is not efficient by any stretch of the 
!     imagination --- only one index can be converted at a time.
!     In addition, a search procedure must be performed, whose 
!     efficiency is inversely proportional to the size of the 
!     decomposition (in particular, to the number of "runs").  
!     Conceptually this mapping should be used only once in the 
!     program for initialization, and subsequently all calculations 
!     should take place using local indices.
!
! !SYSTEM ROUTINES:
!     SIZE
!
! !REVISION HISTORY:
!   98.03.20   Sawyer     Creation
!
!EOP
!------------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER  :: J, Counter
      LOGICAL  :: Found
!
      CPP_ENTER_PROCEDURE( "DECOMPL2G" )
      CPP_ASSERT_F90( Pe .GE. 0 )
      CPP_ASSERT_F90( Pe .LT. SIZE(Decomp%Head) )
      CPP_ASSERT_F90( Local .GT. 0 )
      CPP_ASSERT_F90( Local .LE. Decomp%NumEntries(Pe+1) )

      Counter = 0
      Found   = .FALSE.
      J = 0
      DO WHILE ( .NOT. Found )
        J = J+1
        Counter = Counter + Decomp%Head(Pe+1)%EndTags(J) -               &
                            Decomp%Head(Pe+1)%StartTags(J) + 1
        IF ( Local .LE.  Counter ) THEN
          Found = .TRUE.
!
! The following calculation is not immediately obvious.  Think about it
!
          Global = Local - Counter + Decomp%Head(Pe+1)%EndTags(J)
          Found = .TRUE.
        ELSEIF ( J .GE. SIZE( Decomp%Head(Pe+1)%StartTags ) ) THEN
!
! Emergency brake
!
          Found = .TRUE.
          Global = 0
        ENDIF
      ENDDO

      CPP_LEAVE_PROCEDURE( "DECOMPL2G" )
      RETURN
!
!EOC
      END SUBROUTINE DecompL2G
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompL2GVector --- Map global index to local and PE
!
! !INTERFACE:

      SUBROUTINE DecompL2GVector ( Decomp, N, Local, Pe, Global ) 1
! !USES:
      IMPLICIT NONE
!
! !INPUT PARAMETERS:
      TYPE(DecompType), INTENT( IN )   :: Decomp  ! Decomp information
      INTEGER, INTENT( IN )            :: N       ! Number of indices
      INTEGER, INTENT( IN )            :: Local(:)! Local index
      INTEGER, INTENT( IN )            :: Pe(:)   ! Pe location
!
! !OUTPUT PARAMETERS:
      INTEGER, INTENT( OUT )           :: Global(:) ! Global index
!
!
! !DESCRIPTION:
!     Given a decomposition and a local-pe index pair, this routine 
!     returns  the 2-D global index. If the local index is not found, 
!     0 is returned. 
!
!     Note that this routine is not efficient by any stretch of the 
!     imagination --- only one index can be converted at a time.
!     In addition, a search procedure must be performed, whose 
!     efficiency is inversely proportional to the size of the 
!     decomposition (in particular, to the number of "runs").  
!     Conceptually this mapping should be used only once in the 
!     program for initialization, and subsequently all calculations 
!     should take place using local indices.
!
! !SYSTEM ROUTINES:
!     SIZE
!
! !REVISION HISTORY:
!   02.11.09   Sawyer     Creation from decomplocaltoglobal
!
!EOP
!------------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
    INTEGER  :: I, J, Counter
    LOGICAL  :: Found
!
    CPP_ENTER_PROCEDURE( "DECOMPL2GVECTOR" )
    DO I=1,N
      CPP_ASSERT_F90( Pe(I) .GE. 0 )
      CPP_ASSERT_F90( Pe(I) .LT. SIZE(Decomp%Head) )
      CPP_ASSERT_F90( Local(I) .GT. 0 )
      CPP_ASSERT_F90( Local(I) .LE. Decomp%NumEntries(Pe(I)+1) )

      Counter = 0
      Found   = .FALSE.
      J = 0
      DO WHILE ( .NOT. Found )
        J = J+1
        Counter = Counter + Decomp%Head(Pe(I)+1)%EndTags(J) -               &
                            Decomp%Head(Pe(I)+1)%StartTags(J) + 1
        IF ( Local(I) .LE.  Counter ) THEN
          Found = .TRUE.
!
! The following calculation is not immediately obvious.  Think about it
!
          Global(I) = Local(I) - Counter + Decomp%Head(Pe(I)+1)%EndTags(J)
          Found = .TRUE.
        ELSEIF ( J .GE. SIZE( Decomp%Head(Pe(I)+1)%StartTags ) ) THEN
!
! Emergency brake
!
          Found = .TRUE.
          Global(I) = 0
        ENDIF
      ENDDO
    ENDDO

    CPP_LEAVE_PROCEDURE( "DECOMPL2GVECTOR" )
    RETURN
!
!EOC
    END SUBROUTINE DecompL2GVector
!------------------------------------------------------------------------


!------------------------------------------------------------------------
!BOP
! !IROUTINE: DecompInfo --- Information about decomposition
!
! !INTERFACE:

      SUBROUTINE DecompInfo( Decomp, Npes, TotalPts ) 16
! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      TYPE(DecompType), INTENT( IN ):: Decomp   ! Decomp information

! !OUTPUT PARAMETERS:
      INTEGER, INTENT( OUT )        :: Npes     ! Npes in decomposition
      INTEGER, INTENT( OUT )        :: TotalPts ! Total points in domain
!
!
! !DESCRIPTION:
!     Return information about the decomposition: the number of
!     PEs over which the domain is decomposed, and the size of
!     the domain.
!
! !REVISION HISTORY:
!   00.11.12   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
!
!
      CPP_ENTER_PROCEDURE( "DECOMPINFO" )

      Npes = SIZE( Decomp%Head )
      TotalPts = Decomp%GlobalSize

      CPP_LEAVE_PROCEDURE( "DECOMPINFO" )
      RETURN
!EOC
      END SUBROUTINE DecompInfo
!------------------------------------------------------------------------

      END MODULE decompmodule