#if !defined(STAND_ALONE)
#endif
#define _SMEMORY 1
!-----------------------------------------------------------------------
!         Nasa/GSFC, Data Assimilation Office, Code 910.3, GEOS/DAS
!-----------------------------------------------------------------------

      MODULE parutilitiesmodule
#if defined( SPMD )
!BOP
!
! !MODULE: parutilitiesmodule
!
! !USES:
#if defined( STAND_ALONE )
# define iulog 6
#else
      use cam_logfile, only: iulog
#endif
#if !defined(STAND_ALONE)
      USE shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i8, &
                              r4 => shr_kind_r4
#endif
      USE mod_comm, ONLY : commglobal, gid, numpro, blockdescriptor, max_nparcels
#include "debug.h"
      IMPLICIT NONE
#include "mpif.h"
#include "pilgrim.h"

!
! !PUBLIC DATA MEMBERS:
      PUBLIC     Gsize
      PUBLIC     INT4, REAL4, REAL8
      PUBLIC     SUMOP, MAXOP, MINOP, BCSTOP
       

      INTEGER,SAVE :: GSize        ! Size of communicator CommGlobal
                                   ! Equivalent to mod_comm::numpro
#define CPP_SUM_OP 101
#define CPP_MAX_OP 102
#define CPP_MIN_OP 103
#define CPP_BCST_OP 104

      INTEGER,SAVE :: INT4  = MPI_INTEGER
      INTEGER,SAVE :: REAL4 = MPI_REAL
      INTEGER,SAVE :: REAL8 = MPI_DOUBLE_PRECISION
      INTEGER,SAVE :: SUMOP = MPI_SUM
      INTEGER,SAVE :: MAXOP = MPI_MAX
      INTEGER,SAVE :: MINOP = MPI_MIN
      INTEGER,SAVE :: BCSTOP = CPP_BCST_OP

! !PUBLIC MEMBER FUNCTIONS:
      PUBLIC ParPatternType

      TYPE ParPatternType
        INTEGER ::     Comm                  ! Communicator
        INTEGER ::     Iam                   ! My rank in communicator
        INTEGER ::     Size                  ! Size of communicator
        TYPE(BlockDescriptor), POINTER :: SendDesc(:) ! Array of descriptors
        TYPE(BlockDescriptor), POINTER :: RecvDesc(:) ! Array of descriptors
      END TYPE ParPatternType 


#ifdef _SMEMORY
      TYPE ParInfoType
        INTEGER :: numRecvSeg               ! number of received segments
        INTEGER :: numSendSeg               ! number of send segments
        INTEGER :: maxNumSeg                ! maximum number of segments over all processors
        INTEGER :: numRecvNeigh             ! number of receive neighbors
        INTEGER :: numSendNeigh             ! number of send neighbors
      END TYPE ParInfoType
#endif

      PUBLIC     ParInit, ParSplit, ParFree, ParExit
      PUBLIC     ParScatter, ParGather
      PUBLIC     ParBeginTransfer, ParEndTransfer
      PUBLIC     ParExchangeVector, ParCollective
      PUBLIC     ParPatternCreate, ParPatternFree


      INTERFACE     ParPatternCreate
        MODULE PROCEDURE ParPatternCopy
        MODULE PROCEDURE ParPatternGhost
        MODULE PROCEDURE ParPatternDecompToDecomp
        MODULE PROCEDURE ParPatternDecompToGhost
        MODULE PROCEDURE ParPatternGhostToDecomp
        MODULE PROCEDURE ParPatternGhostToGhost
      END INTERFACE
 

      INTERFACE     ParScatter
        MODULE PROCEDURE ParScatterReal
        MODULE PROCEDURE ParScatterReal4
        MODULE PROCEDURE ParScatterInt
      END INTERFACE
 

      INTERFACE     ParGather
        MODULE PROCEDURE ParGatherReal
        MODULE PROCEDURE ParGatherReal4
        MODULE PROCEDURE ParGatherInt
      END INTERFACE


      INTERFACE     ParBeginTransfer
        MODULE PROCEDURE ParBeginTransferReal
        MODULE PROCEDURE ParBeginTransferPattern1D
        MODULE PROCEDURE ParBeginTransferPattern1Dint
        MODULE PROCEDURE ParBeginTransferPattern2D
        MODULE PROCEDURE ParBeginTransferPattern3D
        MODULE PROCEDURE ParBeginTransferPattern4D
!        MODULE PROCEDURE ParBeginTransferInt
      END INTERFACE


      INTERFACE     ParEndTransfer
        MODULE PROCEDURE ParEndTransferReal
        MODULE PROCEDURE ParEndTransferPattern1D
        MODULE PROCEDURE ParEndTransferPattern1Dint
        MODULE PROCEDURE ParEndTransferPattern2D
        MODULE PROCEDURE ParEndTransferPattern3D
        MODULE PROCEDURE ParEndTransferPattern4D
!        MODULE PROCEDURE ParEndTransferInt
      END INTERFACE


      INTERFACE     ParExchangeVector
        MODULE PROCEDURE ParExchangeVectorReal
        MODULE PROCEDURE ParExchangeVectorReal4
        MODULE PROCEDURE ParExchangeVectorInt
      END INTERFACE


      INTERFACE     ParCollective
        MODULE PROCEDURE ParCollectiveBarrier
        MODULE PROCEDURE ParCollective0D
        MODULE PROCEDURE ParCollective1D
        MODULE PROCEDURE ParCollective1DReal4
        MODULE PROCEDURE ParCollective2D
        MODULE PROCEDURE ParCollective2DReal4
        MODULE PROCEDURE ParCollective3D
        MODULE PROCEDURE ParCollective0DInt
        MODULE PROCEDURE ParCollective0DStr
        MODULE PROCEDURE ParCollective1DInt
        MODULE PROCEDURE ParCollective1DStr
        MODULE PROCEDURE ParCollective2DInt
      END INTERFACE

#ifdef _SMEMORY

      INTERFACE   ParCalcInfo
        MODULE PROCEDURE ParCalcInfoDecompToGhost
        MODULE PROCEDURE ParCalcInfoDecompToDecomp
        MODULE PROCEDURE ParCalcInfoGhostToGhost
        MODULE PROCEDURE ParCalcInfoGhostToDecomp
      END INTERFACE
#endif

!
! !DESCRIPTION:
!
!      This module provides the basic utilities to support parallelism
!      on a distributed or shared memory multiprocessor.
!
!      \begin{center}
!      \begin{tabular}{|l|l|} \hline \hline
!        ParInit           & Initialize the parallel system \\ \hline
!        ParExit           & Exit from the parallel system \\ \hline
!        ParSplit          & Create a Compute grid of PEs   \\ \hline
!        ParFree           & Free a split communicator \\ \hline
!        ParScatter        & Scatter global slice to local slices \\ \hline
!        ParGather         & Gather local slices to one global \\ \hline
!        ParBeginTransfer  & Initiate an all-to-all packet transfer \\ \hline
!        ParEndTransfer    & Complete an all-to-all packet transfer \\ \hline
!        ParExchangeVector & Perform a blocking all-to-all vector exchange \\ \hline
!        ParCollective     & Collective operation across communicator \\ \hline
!      \end{tabular}
!      \end{center}
!      \vspace{2mm}
!
!      Other utilities can be added to this module as needs evolve.
!
!      Conceptually the intention is to aggregate as many of the
!      MPI communication calls as possible into a well-maintained
!      module.  This will help avoid the occurrence of MPI spaghetti 
!      code.  
!
!      This module is tailored to GEOS DAS and implements the 
!      design of Lucchesi/Mirin/Sawyer/Larson.
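!
!      As an illustrative sketch (the communicator, color value, and
!      variable names here are assumptions, not part of the documented
!      interface), a typical call sequence is:
!
!         CALL ParInit( Comm=MPI_COMM_WORLD )
!         CALL ParSplit( MPI_COMM_WORLD, color, worldRank,             &
!                        newComm, myId, nProcs )
!         ! ... computation and communication within newComm ...
!         CALL ParFree( newComm )
!         CALL ParExit( )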
!
! !REVISION HISTORY:
!   97.02.01   Sawyer     Creation
!   97.07.22   Sawyer     Removal of DecompType related subroutines
!   97.08.13   Sawyer     Added ParScatter/Gather for Integers
!   97.09.26   Sawyer     Additions of Sparse communication primitives
!   97.12.01   Sawyer     Changed all MPI_SSEND to MPI_ISEND
!   97.12.23   Lucchesi   Added member variables IsIONode and InterComm
!   98.01.06   Sawyer     Additions from RL for I/O Nodes
!   98.02.02   Sawyer     Added the Cartesian data members
!   98.02.05   Sawyer     Removed the use of intercommunicators
!   98.02.23   Sawyer     Added ghosting utilities
!   98.02.25   Sawyer     Modified interface of BeginTransfer
!   98.03.03   Sawyer     Added Global ID number to public data members
!   98.03.25   Sawyer     Added documentation for walkthrough
!   98.04.16   Sawyer     Removed all use of MPI_CART (CommRow redefined)
!   98.07.23   Sawyer     Added ParGhost, ParPoleDot; ParBegin/EndGhost out
!   98.09.15   Sawyer     Added ParMerge, ParPoleGhost
!   98.09.17   Sawyer     Added ParSum, removed ParPoleDot
!   99.01.18   Sawyer     Minor cleaning
!   99.03.04   Sawyer     Revised SHMEM concept for Transfer
!   99.04.22   Sawyer     Removed COMMON for handles -- they are
!                         always used in same program unit.
!   99.05.21   Sawyer     Reintroduced barriers in Scatter/Gather
!   99.06.03   Sawyer     USE_SHMEM revisions
!   99.12.10   Sawyer     ParInit now sets GID, Gsize
!   99.12.13   Sawyer     Version slimmed down for FVCCM release
!   00.06.14   Sawyer     Precision module now used
!   00.07.07   Sawyer     Removed 2D scatter/gather; simplified API
!   00.07.30   Sawyer     Full implementation with shared memory
!   00.08.09   Sawyer     Replaced ParSum with ParCollective
!   00.08.28   Sawyer     Moved LLNL 2D data to LLNL2DModule; new MLP impl
!   01.02.04   Sawyer     Added PatternType and related routines
!   01.02.12   Sawyer     Converted to free format
!   02.10.30   Sawyer     Welded with mod_comm
!   03.03.06   Sawyer     Fix parpatterncreate for MPI2; use MPI_DATATYPE_NULL
!   05.10.12   Worley     Support for vectorization modifications in mod_comm
!   06.03.01   Sawyer     Merged CAM and GEOS5 versions
!   07.01.05   Mirin      Eliminated direct use of Gsize
!   07.09.04   Dennis     Reduced temporary memory usage
!
! !BUGS:
!   There are several MPI_Barriers at various locations in the code.
!   These avoid potential race conditions which probably occur only
!   when the number of physical processors is less than the number of
!   message-passing processes.  Remove these barriers at your own risk.
!
!EOP

      INTEGER, SAVE :: InHandle(MAX_PAX, MAX_SMP, MAX_TRF)
      INTEGER, SAVE :: OutHandle(MAX_PAX,MAX_SMP, MAX_TRF)
      INTEGER, SAVE :: BegTrf = 0  ! Ongoing overlapped begintransfer # 
      INTEGER, SAVE :: EndTrf = 0  ! Ongoing overlapped endtransfer #
      LOGICAL, SAVE :: Initialized = .FALSE. ! Flag for initialization of parutilitiesmodule.

      CONTAINS
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParInit --- Initialize the parallel execution
!
! !INTERFACE: 

      SUBROUTINE ParInit ( Comm, npryzxy, mod_method, mod_geopk, mod_gatscat, mod_maxirr )
!
! !USES:
      USE mod_comm, ONLY : mp_init
      IMPLICIT NONE
! !INPUT PARAMETERS:
      INTEGER, OPTIONAL  :: Comm
      INTEGER, OPTIONAL, INTENT(IN) :: npryzxy(4)      ! 2D decompositions
      INTEGER, OPTIONAL, INTENT(IN) :: mod_method      ! CAM optimization
      INTEGER, OPTIONAL, INTENT(IN) :: mod_geopk       ! CAM optimization
      INTEGER, OPTIONAL, INTENT(IN) :: mod_gatscat     ! CAM optimization
      INTEGER, OPTIONAL, INTENT(IN) :: mod_maxirr      ! CAM max simul. trsps.

!
! !DESCRIPTION:
!     Initializes the system.  In MPI mode, call MPI\_INIT if not done 
!     already. If the optional arguments are not provided, default 
!     values will be chosen.  But it is advisable to provide COMM
!     (main communicator) and NPRYZXY (internal 2D decomposition).
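!
!     A minimal sketch (the communicator and the four decomposition
!     counts below are illustrative assumptions):
!
!        CALL ParInit( Comm=MPI_COMM_WORLD, npryzxy=(/ 4, 1, 2, 2 /) )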
!
! !SYSTEM ROUTINES:
!     MPI_INITIALIZED, MPI_INIT
!
! !REVISION HISTORY:
!   97.03.20   Sawyer     Creation
!   97.04.16   Sawyer     Cleaned up for walk-through
!   97.07.03   Sawyer     Reformulated documentation
!   00.07.23   Sawyer     Added shared memory arena implementation
!   02.10.30   Sawyer     Now uses mp_init from mod_comm
!   06.06.15   Sawyer     Added CAM optimizations (passed to mod_comm)
!
!EOP
!-----------------------------------------------------------------------
!BOC

! Initialize mod_comm

      IF (.NOT. Initialized) THEN
         CALL mp_init( Comm, npryzxy, mod_method, mod_geopk, mod_gatscat, mod_maxirr )
         Gsize = numpro   !   Support PILGRIM's Gsize for now
         Initialized = .TRUE.
      ENDIF

      RETURN
!EOC
      END SUBROUTINE ParInit
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParExit --- Finalize the parallel execution
!
! !INTERFACE:

      SUBROUTINE ParExit ( Comm )

! !USES:
      USE mod_comm, ONLY: mp_exit
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, OPTIONAL  :: Comm

! !DESCRIPTION:
!     All PEs, compute and I/O nodes alike, meet here to terminate
!     themselves.  If any PE does not check in, execution will hang
!     here.
!
!     This routine is the very {\em last} thing which is executed!
!
! !LOCAL VARIABLES:
      INTEGER Ierror
!
! !SYSTEM ROUTINES:
!     MPI_BARRIER, MPI_FINALIZE
!
! !REVISION HISTORY:
!   97.03.20   Sawyer     Creation
!   97.04.16   Sawyer     Cleaned up for walk-through
!   97.07.03   Sawyer     Reformulated documentation
!   00.07.23   Sawyer     Added shared memory arena implementation
!   02.08.13   Sawyer     Incorporated mod_comm for low level comm.
!
!EOP
!-----------------------------------------------------------------------
!BOC
      CALL mp_exit(Comm)
      RETURN
!EOC
      END SUBROUTINE ParExit
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE:   ParSplit --- Split into group for I/O and computation
!
! !INTERFACE:

      SUBROUTINE ParSplit( InComm, Color, InID, Comm, MyID, Nprocs )
!
! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )     :: InComm    ! Communicator to split
      INTEGER, INTENT( IN )     :: Color     ! Group label
      INTEGER, INTENT( IN )     :: InID      ! Input ID

! !OUTPUT PARAMETERS:
      INTEGER, INTENT( OUT )    :: Comm      ! Split communicator
      INTEGER, INTENT( OUT )    :: MyID      ! My rank in split communicator
      INTEGER, INTENT( OUT )    :: Nprocs    ! Number of PEs in my group
!
! !DESCRIPTION:
!     This routine splits the PEs into groups.  This is currently only
!     supported in MPI mode.  Read the MPI documentation on
!     MPI\_COMM\_SPLIT thoroughly before using this routine.
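!
!     A hedged sketch: split MPI\_COMM\_WORLD into a compute group
!     (color 0) and an I/O group (color 1); the color assignment and
!     variable names are assumptions for illustration only.
!
!        color = 0
!        IF ( worldRank >= nCompute ) color = 1
!        CALL ParSplit( MPI_COMM_WORLD, color, worldRank,              &
!                       myComm, myId, nProcs )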
!
! !SYSTEM ROUTINES:
!     MPI_COMM_SPLIT, MPI_COMM_SIZE, MPI_COMM_RANK
!
! !REVISION HISTORY:
!   97.03.20   Sawyer     Creation
!   97.04.16   Sawyer     Cleaned up for walk-through
!   97.07.03   Sawyer     Reformulated documentation
!   97.12.01   Sawyer     Xnodes and Ynodes are explicit arguments
!   97.12.23   Lucchesi   Added call to MPI_INTERCOMM_CREATE
!   98.01.06   Sawyer     Additions from RL for I/O Nodes
!   98.02.02   Sawyer     Added the Cartesian information
!   98.02.05   Sawyer     Removed the use of intercommunicators
!   98.04.16   Sawyer     Removed all use of MPI_CART (CommRow redefined)
!   99.01.10   Sawyer     CommRow now defined for all rows
!   00.07.09   Sawyer     Removed 2D computational mesh
!   00.08.08   Sawyer     Redefined as wrapper to mpi_comm_split
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER  Ierror

      CPP_ENTER_PROCEDURE( "PARSPLIT" )
!
!     Split the communicators
!
      CALL MPI_COMM_SPLIT( InComm, Color, InID, Comm, Ierror )
      IF ( Comm .ne. MPI_COMM_NULL ) THEN
        CALL MPI_COMM_RANK( Comm, MyID, Ierror )
        CALL MPI_COMM_SIZE( Comm, Nprocs, Ierror )
      ELSE
!
!     This PE does not participate: mark with impossible values
!
        MyID = -1
        Nprocs = -1
      ENDIF

      CPP_LEAVE_PROCEDURE( "PARSPLIT" )
      RETURN
!EOC
      END SUBROUTINE ParSplit
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE:   ParFree --- Free a communicator
!
! !INTERFACE:

      SUBROUTINE ParFree( InComm ) 
!
! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER InComm

!
! !DESCRIPTION:
!     This routine frees a communicator created with ParSplit
!
! !REVISION HISTORY:
!   97.09.11   Sawyer     Creation, to complement ParSplit
!   00.07.24   Sawyer     Revamped ParMerge into a free communicator 
!
! !LOCAL VARIABLES:
      INTEGER  Ierror
!
!EOP
!-----------------------------------------------------------------------
!BOC
      CPP_ENTER_PROCEDURE( "PARFREE" )
!
      CALL MPI_COMM_FREE( InComm, Ierror ) 
      CPP_LEAVE_PROCEDURE( "PARFREE" )
      RETURN
!EOC
      END SUBROUTINE ParFree
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE:   ParPatternCopy --- Duplicate/replicate a comm pattern
!
! !INTERFACE:

      SUBROUTINE ParPatternCopy( InComm, PatternIn, PatternOut, Multiplicity )
!
! !USES:
      USE mod_comm, ONLY : get_partneroffset
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER,  INTENT( IN )               :: InComm       ! Communicator
      TYPE(ParPatternType), INTENT( IN )   :: PatternIn    ! Comm Pattern
      INTEGER, INTENT( IN ),  OPTIONAL     :: Multiplicity ! Replication factor

! !OUTPUT PARAMETERS:
      TYPE(ParPatternType), INTENT( OUT )  :: PatternOut  ! Comm Pattern
!
! !DESCRIPTION:
!     This routine duplicates a given communication pattern. 
!
!     Optionally a multiplicity can be added.  This replicates the
!     communication pattern Mult times, that is for the case that
!     the data structures are replicated in the final dimension
!     Mult times.  A typical example is a pattern describing a 2D
!     array, e.g. a lat-lon decomposition, which will be used
!     to copy a 3D lat-lon-lev array.  The strides (e.g. the number
!     of elements in one plane) of the source (send) and target 
!     (recv) arrays are now calculated internally.
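!
!     A hedged sketch (pattern and level-count names are assumptions):
!     replicate an existing 2D (lat-lon) pattern PLEV times so that it
!     can move a 3D (lat-lon-lev) array; ParPatternCopy is reached
!     through the generic ParPatternCreate interface:
!
!        CALL ParPatternCreate( comm, pattern2D, pattern3D,            &
!                               Multiplicity=PLEV )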
!
! !SYSTEM ROUTINES:
!     MPI_TYPE_UB, MPI_TYPE_HVECTOR, MPI_TYPE_COMMIT
!
! !REVISION HISTORY:
!   03.03.20   Sawyer     Creation
!   03.06.26   Sawyer     Removed StrideSend/Recv from API
!
!EOP 
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER  Stride_S, Stride_R, Mult, Iam, GroupSize, Ipe, Ierror
      INTEGER  Disp, Length, I, J, ub, method

      CPP_ENTER_PROCEDURE( "PARPATTERNCOPY" )

      method = PatternIn%RecvDesc(1)%method

!
! Decide if this is a simple copy, or a multiple replication
!
      IF ( present(Multiplicity) ) THEN
          Mult = Multiplicity
      ELSE
          Mult = 1
      ENDIF

      CALL MPI_COMM_DUP( PatternIn%Comm, PatternOut%Comm, Ierror )
      CALL MPI_COMM_SIZE( PatternIn%Comm, GroupSize, Ierror )
      CALL MPI_COMM_RANK( PatternIn%Comm, Iam, Ierror )

      PatternOut%Iam  = Iam
      PatternOut%Size = GroupSize

      ALLOCATE( PatternOut%SendDesc( GroupSize ) )
      ALLOCATE( PatternOut%RecvDesc( GroupSize ) )

      PatternOut%SendDesc(:)%method = PatternIn%SendDesc(:)%method
      PatternOut%RecvDesc(:)%method = PatternIn%RecvDesc(:)%method
!
! Determine the strides which are by construction the maximum upper
! bound of all the derived types.  This is due to the fact that
! there are no 'holes' in the data types: even if one PE does not
! send to any other PEs, it will still have a data type for 'sending'
! data to itself.
!
        Stride_S = 0
        Stride_R = 0       
        DO Ipe=1, GroupSize
          IF ( PatternIn%SendDesc(Ipe)%type /= MPI_DATATYPE_NULL ) THEN
            CALL MPI_TYPE_UB( PatternIn%SendDesc(Ipe)%type, ub, ierror )
            Stride_S = max(Stride_S,ub)
          ENDIF
          IF ( PatternIn%RecvDesc(Ipe)%type /= MPI_DATATYPE_NULL ) THEN
            CALL MPI_TYPE_UB( PatternIn%RecvDesc(Ipe)%type, ub, ierror )
            Stride_R = max(Stride_R,ub)
          ENDIF
        ENDDO

!
! Determine the output data types
!
        DO Ipe=1, GroupSize
          IF ( PatternIn%SendDesc(ipe)%type /= MPI_DATATYPE_NULL ) THEN
            CALL MPI_TYPE_HVECTOR( Mult, 1, Stride_S, PatternIn%SendDesc(Ipe)%type,&
                                   PatternOut%SendDesc(Ipe)%type, Ierror )
            CALL MPI_TYPE_COMMIT( PatternOut%SendDesc(Ipe)%type, Ierror )
          ELSE
            PatternOut%SendDesc(ipe)%type = MPI_DATATYPE_NULL
          ENDIF
          IF ( PatternIn%RecvDesc(Ipe)%type /= MPI_DATATYPE_NULL ) THEN
            CALL MPI_TYPE_HVECTOR( Mult, 1, Stride_R, PatternIn%RecvDesc(Ipe)%type,&
                                   PatternOut%RecvDesc(Ipe)%type, Ierror )
            CALL MPI_TYPE_COMMIT( PatternOut%RecvDesc(Ipe)%type, Ierror )
          ELSE
            PatternOut%RecvDesc(ipe)%type = MPI_DATATYPE_NULL
          ENDIF
        ENDDO

!
! Determine the stride, which is the sum of all the blocksizes for all
! the derived types (there are no 'holes').
!      
        Stride_S = 0
        Stride_R = 0       
        DO Ipe=1, GroupSize
          Stride_S = Stride_S + sum( PatternIn%SendDesc(ipe)%BlockSizes(:) )
          Stride_R = Stride_R + sum( PatternIn%RecvDesc(ipe)%BlockSizes(:) )
        ENDDO

        DO ipe=1, GroupSize
          Length = SIZE(PatternIn%SendDesc(ipe)%BlockSizes) 
          ALLOCATE( PatternOut%SendDesc(ipe)%Displacements(Length*Mult) )
          ALLOCATE( PatternOut%SendDesc(ipe)%BlockSizes(Length*Mult) )
#if defined( DEBUG_PARPATTERNCOPY )
          write(iulog,*) "Multiplicity", Mult
          write(iulog,*) "Old send blocksizes", PatternIn%SendDesc(ipe)%BlockSizes
#endif
          DO i=1, Length
            Disp = PatternIn%SendDesc(ipe)%Displacements(i)
            DO j=1, Mult
              PatternOut%SendDesc(ipe)%BlockSizes(i+(j-1)*Length) =     &
                    PatternIn%SendDesc(ipe)%BlockSizes(i)
              PatternOut%SendDesc(ipe)%Displacements(i+(j-1)*Length) = Disp
              Disp = Disp + Stride_S
            ENDDO
          ENDDO
          PatternOut%SendDesc(ipe)%Nparcels  = &
            size (PatternOut%SendDesc(ipe)%Displacements)
          PatternOut%SendDesc(ipe)%Tot_Size = &
            sum  (PatternOut%SendDesc(ipe)%Blocksizes)
          Max_Nparcels = max (Max_Nparcels, PatternOut%SendDesc(ipe)%Nparcels)
#if defined( DEBUG_PARPATTERNCOPY )
          write(iulog,*) "Send blocksizes", PatternOut%SendDesc(ipe)%BlockSizes
          write(iulog,*) "Old recv blocksizes", PatternIn%RecvDesc(ipe)%BlockSizes
#endif
          Length = SIZE(PatternIn%RecvDesc(ipe)%BlockSizes) 
          ALLOCATE( PatternOut%RecvDesc(ipe)%Displacements(Length*Mult) )
          ALLOCATE( PatternOut%RecvDesc(ipe)%BlockSizes(Length*Mult) )
          DO i=1, Length
            Disp = PatternIn%RecvDesc(ipe)%Displacements(i)
            DO j=1, Mult
              PatternOut%RecvDesc(ipe)%BlockSizes(i+(j-1)*Length) =     &
                    PatternIn%RecvDesc(ipe)%BlockSizes(i)
              PatternOut%RecvDesc(ipe)%Displacements(i+(j-1)*Length) = Disp
              Disp = Disp + Stride_R
            ENDDO
          ENDDO
          PatternOut%RecvDesc(ipe)%Nparcels  = &
            size (PatternOut%RecvDesc(ipe)%Displacements)
          PatternOut%RecvDesc(ipe)%Tot_Size = &
            sum  (PatternOut%RecvDesc(ipe)%Blocksizes)
          Max_Nparcels = max (Max_Nparcels, PatternOut%RecvDesc(ipe)%Nparcels)
#if defined( DEBUG_PARPATTERNCOPY )
          write(iulog,*) "Recv blocksizes", PatternOut%RecvDesc(ipe)%BlockSizes
#endif
        ENDDO

        CALL get_partneroffset( InComm, PatternOut%SendDesc, PatternOut%RecvDesc )
      
      CPP_LEAVE_PROCEDURE( "PARPATTERNCOPY" )
      RETURN
!EOC
      END SUBROUTINE ParPatternCopy
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !IROUTINE:   ParPatternGhost --- Create pattern for given ghosting
!
! !INTERFACE:

      SUBROUTINE ParPatternGhost( InComm, Ghost, Pattern, mod_method, T )
!
! !USES:
      USE decompmodule, ONLY : DecompGlobalToLocal, DecompLocalToGlobal
      USE ghostmodule, ONLY : GhostType, GhostInfo
      USE mod_comm, ONLY : get_partneroffset
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER,  INTENT( IN )               :: InComm     ! Communicator
      TYPE(GhostType),  INTENT( IN )       :: Ghost      ! Ghost region definition
      INTEGER,  INTENT( IN ), OPTIONAL     :: mod_method ! contiguous or derived type
      INTEGER, INTENT( IN ),  OPTIONAL     :: T          ! MPI data type (default CPP_MPI_REAL8)

! !OUTPUT PARAMETERS:
      TYPE(ParPatternType), INTENT( OUT )  :: Pattern ! Comm Pattern
!
! !DESCRIPTION:
!     This routine constructs a communication pattern from the ghost
!     region definition.  That is, the resulting communication pattern
!     can be used in ParBegin/EndTransfer with the ghosted arrays as
!     inputs.  
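!
!     A hedged sketch (ghost descriptor, array, and pattern names are
!     assumptions, and the transfer argument order is illustrative):
!
!        CALL ParPatternCreate( comm, ghost, pattern )
!        ! ... later, update the ghost region of the ghosted array A:
!        CALL ParBeginTransfer( pattern, a, a )
!        CALL ParEndTransfer  ( pattern, a, a )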
!
! !SYSTEM ROUTINES:
!    MPI_COMM_SIZE,  MPI_COMM_RANK, MPI_COMM_DUP
!    MPI_TYPE_INDEXED, MPI_TYPE_COMMIT (depending on method)
!
! !REVISION HISTORY:
!   01.02.10   Sawyer     Creation
!   01.06.02   Sawyer     Renamed ParPatternGhost
!   02.06.27   Sawyer     Added data type "T" as optional argument
!   03.03.04   Sawyer     Set partneroffsets field
!   03.11.11   Mirin      Added optional argument mod_method
!
!EOP 
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER  i, j, ipe, pe, Iam, GroupSize, Num, Length, Ptr, Ierror
      INTEGER  Global, End, Local, GlobalSize, LocalSize, BorderSize
      INTEGER  DataType
      INTEGER, ALLOCATABLE :: InVector(:), OutVector(:)
      INTEGER, ALLOCATABLE :: LenInVector(:), LenOutVector(:)
      INTEGER              :: method

      CPP_ENTER_PROCEDURE( "PARPATTERNGHOST" )

      IF (present(T)) THEN
        DataType = T
      ELSE
        DataType = CPP_MPI_REAL8
      ENDIF

      IF (present(mod_method)) THEN
        method = mod_method
      ELSE
        method = 0     ! Default method - see mod_comm for description
      ENDIF
!
! First request the needed ghost values from other processors.
!
      CALL MPI_COMM_DUP( InComm, Pattern%Comm, Ierror )
      CALL MPI_COMM_SIZE( InComm, GroupSize, Ierror )
      CALL MPI_COMM_RANK( InComm, Iam, Ierror )

      Pattern%Iam  = Iam
      Pattern%Size = GroupSize

      ALLOCATE( Pattern%SendDesc( GroupSize ) )
      ALLOCATE( Pattern%RecvDesc( GroupSize ) )

      Pattern%SendDesc(:)%method = method
      Pattern%RecvDesc(:)%method = method

!
! Temporary variables
!
      ALLOCATE( LenInVector( GroupSize ) )
      ALLOCATE( LenOutVector( GroupSize ) )

      CALL GhostInfo( Ghost,GroupSize,GlobalSize,LocalSize,BorderSize )
      ALLOCATE( InVector( 2*BorderSize ) )
      ALLOCATE( OutVector( 2*LocalSize ) )

!
! A rather complicated loop to define the local ghost region.
! The concept is the following:  go through all the points in the
! border data structure.   It contains global indices of the points
! which have to be copied over from neighboring PEs.  These indices
! are collected into InVector for transmission to those PEs, in
! effect informing them of the local PE's requirements.
!
! A special case is supported:  if the ghost domain wraps around
! onto the domain of the local PE!  This is very tricky, because
! the index space in both Ghost%Border and Ghost%Local MUST be
! unique for DecompGlobalToLocal to work.   Solution:  ghost 
! points are marked with the negative value of the needed domain 
! value in both Ghost%Border and Ghost%Local.  These are "snapped 
! over" to the true global index with the ABS function, so that 
! they can be subsequently found in the true local domain.
!
      j = 1
      DO ipe=1, GroupSize
        Num = SIZE(Ghost%Border%Head(ipe)%StartTags)
        Length = 0
        DO i = 1, Num
          Global = Ghost%Border%Head(ipe)%StartTags(i)
          IF ( Global /= 0 ) THEN
            Length = Length + 1
            End    = Ghost%Border%Head(ipe)%EndTags(i)
            InVector(j) = ABS(Global)
            InVector(j+1) = ABS(End)
            CALL DecompGlobalToLocal( Ghost%Local, Global, Local, Pe )
            OutVector(Length) = Local-1                ! Zero-based address
            OutVector(Length+Num) = End - Global+1     ! Parcel size
            j = j + 2
          ENDIF
        ENDDO
        LenInVector(ipe) = 2*Length

!
! Set the receive buffer descriptor
!
#if defined(DEBUG_PARPATTERNGHOST)
        write(iulog,*) "Iam",Iam,"Pe",Ipe-1,"Lens",OutVector(Num+1:Num+Length), &
             "Displacements", OutVector(1:Length)
#endif

          IF ( Length > 0 .and. method > 0 ) THEN
            CALL MPI_TYPE_INDEXED( Length, OutVector(Num+1), OutVector,    &
                                   DataType, Ptr, Ierror )
            CALL MPI_TYPE_COMMIT( Ptr, Ierror )
            Pattern%RecvDesc(ipe)%type = Ptr
          ELSE
            Pattern%RecvDesc(ipe)%type = MPI_DATATYPE_NULL
          ENDIF

          ALLOCATE( Pattern%RecvDesc(ipe)%Displacements(Length) )
          ALLOCATE( Pattern%RecvDesc(ipe)%BlockSizes(Length) )
          DO i=1, Length
            Pattern%RecvDesc(ipe)%Displacements(i) = OutVector(i)
            Pattern%RecvDesc(ipe)%BlockSizes(i)    = OutVector(Num+i)
          ENDDO            
          Pattern%RecvDesc(ipe)%Nparcels  = &
            size (Pattern%RecvDesc(ipe)%Displacements)
          Pattern%RecvDesc(ipe)%Tot_Size = &
            sum  (Pattern%RecvDesc(ipe)%Blocksizes)
          Max_Nparcels = max (Max_Nparcels, Pattern%RecvDesc(ipe)%Nparcels)

      ENDDO

!
! Everybody exchanges the needed information
!
#if defined(DEBUG_PARPATTERNGHOST)
      write(iulog,*) "iam", iam, "In", LenInVector,                            &
                InVector( 1:SUM(LenInVector) )
#endif
      CALL ParExchangeVectorInt( InComm, LenInVector, InVector,          &
                                     LenOutVector, OutVector )
#if defined(DEBUG_PARPATTERNGHOST)
      write(iulog,*) "iam", iam, "Out", LenOutVector,                          &
                OutVector( 1:SUM(LenOutVector) )
#endif

!
! Now everyone has the segments which need to be sent to the 
! immediate neighbors.  Save these in PatternType.
!
      j = 1
      DO ipe = 1, GroupSize
        Num = LenOutVector(ipe) / 2
        DO i = 1, Num
          CALL DecompGlobalToLocal( Ghost%Local,OutVector(j),Local,pe )
          InVector(i) = Local-1
          InVector(i+Num) = OutVector(j+1) - OutVector(j) + 1
          j = j + 2
        ENDDO
#if defined(DEBUG_PARPATTERNGHOST)
        write(iulog,*) "Iam", Iam, "To", ipe-1, "InVector",                    &
              InVector(1:Num), "block size", InVector(Num+1:2*Num)
#endif

          IF ( Num > 0 .and. method > 0 ) THEN
            CALL MPI_TYPE_INDEXED( Num, InVector(Num+1), InVector,         &
                                   DataType, Ptr, Ierror )
            CALL MPI_TYPE_COMMIT( Ptr, Ierror )
            Pattern%SendDesc(ipe)%type = Ptr
          ELSE
            Pattern%SendDesc(ipe)%type = MPI_DATATYPE_NULL
          ENDIF

          ALLOCATE( Pattern%SendDesc(ipe)%Displacements(Num) )
          ALLOCATE( Pattern%SendDesc(ipe)%BlockSizes(Num) )
          DO i=1, Num
            Pattern%SendDesc(ipe)%Displacements(i) = InVector(i)
            Pattern%SendDesc(ipe)%BlockSizes(i)    = InVector(Num+i)
          ENDDO            
          Pattern%SendDesc(ipe)%Nparcels  = &
            size (Pattern%SendDesc(ipe)%Displacements)
          Pattern%SendDesc(ipe)%Tot_Size = &
            sum  (Pattern%SendDesc(ipe)%Blocksizes)
          Max_Nparcels = max (Max_Nparcels, Pattern%SendDesc(ipe)%Nparcels)

      ENDDO

      CALL get_partneroffset( InComm, Pattern%SendDesc, Pattern%RecvDesc )

!
! Clean up the locally allocated variables
!
      DEALLOCATE( OutVector )
      DEALLOCATE( InVector )
      DEALLOCATE( LenOutVector )
      DEALLOCATE( LenInVector )

      CPP_LEAVE_PROCEDURE( "PARPATTERNGHOST" )
      RETURN
!EOC
      END SUBROUTINE ParPatternGhost
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !IROUTINE:   ParPatternDecompToDecomp --- Create pattern between decomps
!
! !INTERFACE:

      SUBROUTINE ParPatternDecompToDecomp( InComm, DA, DB, Pattern, mod_method, T )
!
! !USES:
      USE decompmodule, ONLY : DecompType, DecompGlobalToLocal, DecompInfo
      USE mod_comm, ONLY : get_partneroffset
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER,  INTENT( IN )               :: InComm     ! Communicator
      TYPE(DecompType),  INTENT( IN )      :: DA         ! Source Decomp Desc
      TYPE(DecompType),  INTENT( IN )      :: DB         ! Target Decomp Desc
      INTEGER,  INTENT( IN ), OPTIONAL     :: mod_method ! contiguous or derived type
      INTEGER, INTENT( IN ),  OPTIONAL     :: T          ! MPI data type (default CPP_MPI_REAL8)

! !OUTPUT PARAMETERS:
      TYPE(ParPatternType), INTENT( OUT )  :: Pattern ! Comm Pattern
!
! !DESCRIPTION:
!     This routine constructs a communication pattern for a 
!     transformation from one decomposition to another, i.e., a 
!     so-called "transpose". The resulting communication pattern 
!     can be used in ParBegin/EndTransfer with the decomposed 
!     arrays as inputs.  
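!
!     A hedged sketch (decomposition, array, and pattern names are
!     assumptions) of an XY-to-YZ transpose set up through the generic
!     ParPatternCreate interface:
!
!        CALL ParPatternCreate( comm, decompXY, decompYZ, xy2yz )
!        CALL ParBeginTransfer( xy2yz, arrayXY, arrayYZ )
!        CALL ParEndTransfer  ( xy2yz, arrayXY, arrayYZ )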
!
! !SYSTEM ROUTINES:
!    MPI_COMM_SIZE,  MPI_COMM_RANK, MPI_COMM_DUP
!    MPI_TYPE_INDEXED, MPI_TYPE_COMMIT (depending on method)
!
! !REVISION HISTORY:
!   01.05.29   Sawyer     Creation from RedistributeCreate
!   01.07.13   Sawyer     Rewritten to minimize DecompGlobalToLocal
!   02.07.16   Sawyer     Added data type T
!   03.11.11   Mirin      Added optional argument mod_method
!   07.03.11   Mirin      Generalized to different sized decompositions
!   07.09.04   Dennis     Reduced amount of temporary memory usage
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER I, J, Tag, Local, Pe, LenB, JB, Ipe, Num, Inc, Off
      INTEGER Ptr                                ! Pointer type
      INTEGER GroupSize, Iam, Ierror, DataType
      INTEGER OldPe, TotalPtsA, NpesA, TotalPtsB, NpesB
      INTEGER              :: method
      INTEGER              :: nCount,maxCount,ierr,sz
      INTEGER              :: lenBjmd,nNeigh,maxLenB,maxNeigh
#ifdef _SMEMORY
      TYPE (ParInfoType) :: Info
#endif

      INTEGER, ALLOCATABLE :: Count(:)           ! # segments for each recv PE
      INTEGER, ALLOCATABLE :: CountOut(:)        ! # segments for each send PE

      INTEGER, ALLOCATABLE :: DisplacementsA(:)  ! Generic displacements
      INTEGER, ALLOCATABLE :: BlockSizesA(:)     ! Generic block sizes
      INTEGER, ALLOCATABLE :: LocalA(:)          ! Generic Local indices

      INTEGER, ALLOCATABLE :: DisplacementsB(:)  ! Displacements for B
      INTEGER, ALLOCATABLE :: BlockSizesB(:)     ! Block sizes for B
      INTEGER, ALLOCATABLE :: LocalB(:)          ! Local indices for B
      INTEGER, ALLOCATABLE :: PeB(:)             ! Processor element numbers

      CPP_ENTER_PROCEDURE( "PARPATTERNDECOMPTODECOMP" )

      IF (present(T)) THEN
        DataType = T
      ELSE
        DataType = CPP_MPI_REAL8
      ENDIF

      IF (present(mod_method)) THEN
        method = mod_method
      ELSE
        method = 0     ! Default method - see mod_comm for description
      ENDIF

! Assume this routine is called by processes [ 0,max(NpesA,NpesB) )

      CALL DecompInfo( DA, NpesA, TotalPtsA )
      CALL DecompInfo( DB, NpesB, TotalPtsB )

      CALL MPI_COMM_SIZE( InComm, GroupSize, Ierror )
      CALL MPI_COMM_RANK( InComm, Iam, Ierror )
      CALL MPI_COMM_DUP( InComm, Pattern%Comm, Ierror )

#ifdef _SMEMORY
! Calculate info about the pattern 
      call ParCalcInfo(InComm,DA,DB, Info)
      TotalPtsA=Info%maxNumSeg
      TotalPtsB=Info%maxNumSeg
#endif

      Pattern%Size = GroupSize
      Pattern%Iam  = Iam
!
! Allocate the number of entries and list head arrays
!

!
! Allocate the patterns
!
      ALLOCATE( Pattern%SendDesc( NpesB ) )
      Pattern%SendDesc(:)%method = method
      if (iam .ge. NpesA) then
         do ipe = 1, NpesB
            ALLOCATE( Pattern%SendDesc(ipe)%Displacements(1) )
            ALLOCATE( Pattern%SendDesc(ipe)%BlockSizes(1) )
            Pattern%SendDesc(ipe)%Tot_Size = -1
            Pattern%SendDesc(ipe)%Nparcels = -1
            Pattern%SendDesc(ipe)%type = MPI_DATATYPE_NULL
            Pattern%SendDesc(ipe)%Displacements(1) = -1
            Pattern%SendDesc(ipe)%Blocksizes(1) = -1
         enddo
      endif

      ALLOCATE( Pattern%RecvDesc( NpesA ) )
      Pattern%RecvDesc(:)%method = method
      if (iam .ge. NpesB) then
         do ipe = 1, NpesA
            ALLOCATE( Pattern%RecvDesc(ipe)%Displacements(1) )
            ALLOCATE( Pattern%RecvDesc(ipe)%BlockSizes(1) )
            Pattern%RecvDesc(ipe)%Tot_Size = -1
            Pattern%RecvDesc(ipe)%Nparcels = -1
            Pattern%RecvDesc(ipe)%type = MPI_DATATYPE_NULL
            Pattern%RecvDesc(ipe)%Displacements(1) = -1
            Pattern%RecvDesc(ipe)%Blocksizes(1) = -1
         enddo
      endif

!
! Local allocations
!
      ALLOCATE( DisplacementsA( TotalPtsA ) )   ! Allocate for worst case
      ALLOCATE( BlockSizesA( TotalPtsA ) )      ! Allocate for worst case
      ALLOCATE( LocalA( TotalPtsA ) )           ! Allocate for worst case

      ALLOCATE( DisplacementsB( TotalPtsB ) )   ! Allocate for worst case
      ALLOCATE( BlockSizesB( TotalPtsB ) )      ! Allocate for worst case
      ALLOCATE( LocalB( TotalPtsB ) )           ! Allocate for worst case
      ALLOCATE( PeB( TotalPtsB ) )              ! Allocate for worst case

      ALLOCATE( Count( GroupSize ) )
      ALLOCATE( CountOut( GroupSize ) )

      JB        = 0
      Count     = 0
      LenB      = 0
      LocalA      = 0   !  (needed for parexchangevector later)
      BlocksizesA = 0   !  (needed for parexchangevector later)

      Num    = 0
      Inc    = 0

    if (iam .lt. NpesB) then

!
! Parse through all the tags in the local segment
      DO J = 1, SIZE( DB%Head(iam+1)%StartTags )
        OldPe     = -1         ! Set PE undefined
        DO Tag=DB%Head(iam+1)%StartTags(J), DB%Head(iam+1)%EndTags(J)
!
! Determine the index and PE of this entry on A. This might be inlined later
!
          CALL DecompGlobalToLocal( DA, Tag, Local, Pe )

!
! If ipe-1 is my id, then this is an entry ipe will receive from Pe
!
          IF ( Pe /= OldPe ) THEN
            OldPe   = Pe
            IF ( jb > 0 ) THEN
              BlockSizesB(jb) = LenB
              LenB = 0
            ENDIF
            jb = jb+1                     ! increment the segment index
            DisplacementsB(jb) = Inc      ! Zero-based offset of local segment
            LocalB(jb) = Local-1          ! The local index (zero-based)
            PeB(jb) = Pe                  ! Note the ID of the sender
            Count(Pe+1) = Count(Pe+1)+1 ! Increment counter of segments
          ENDIF
          LenB = LenB+1                   ! Good -- segment is getting longer
          Inc = Inc+1                     ! Increment local index
        ENDDO
      ENDDO
!
! Clean up
!
      IF ( jb>0 ) BlockSizesB(jb) = LenB
#if defined(DEBUG_PARPATTERNDECOMPTODECOMP)
      write(iulog,*) iam, "BlockSizes", BlockSizesB(1:jb), DisplacementsB(1:jb), PeB(1:jb), Count
#endif

      CPP_ASSERT_F90( JB .LE. TotalPtsB )
!
! Now create the pattern from the displacements and block sizes
!
      Inc = 0
      DO ipe = 1, NpesA
!
! Find the segments which are relevant for the sender ipe
! Make compact arrays BlockSizes and Displacements 
!
        DO j = 1, jb
          IF ( PeB(j) == ipe-1 ) THEN
            Inc = Inc + 1
            BlockSizesA(Inc) = BlockSizesB(j)
            DisplacementsA(Inc) = DisplacementsB(j)
            LocalA(Inc)      = LocalB(j)
          ENDIF
        ENDDO
      ENDDO
      CPP_ASSERT_F90( Inc .LE. TotalPtsA )

!
! Create the receiver communication pattern
!
      Off = 0
      DO ipe = 1, NpesA
        Num = Count(ipe)
        if(Num >0) then 
#if defined(DEBUG_PARPATTERNDECOMPTODECOMP)
        write(iulog,*) "Receiver Iam", Iam, "Ipe", Ipe-1, "Num", Num,         &
                 "Displacements", DisplacementsA(Off+1:Off+Num),        &
                 "BlockSizes", BlockSizesA(Off+1:Off+Num)
#endif
        endif
          IF ( Num > 0 .and. method > 0 ) THEN

            CALL MPI_TYPE_INDEXED( Num, BlockSizesA(Off+1),             &
                                   DisplacementsA(Off+1),               &
                                   DataType, Ptr, Ierror )
            CALL MPI_TYPE_COMMIT( Ptr, Ierror )
            Pattern%RecvDesc(ipe)%type = Ptr
          ELSE
            Pattern%RecvDesc(ipe)%type = MPI_DATATYPE_NULL
          ENDIF

          ALLOCATE( Pattern%RecvDesc(ipe)%Displacements(Num) )
          ALLOCATE( Pattern%RecvDesc(ipe)%BlockSizes(Num) )
          DO i=1, Num
            Pattern%RecvDesc(ipe)%Displacements(i) = DisplacementsA(i+Off)
            Pattern%RecvDesc(ipe)%BlockSizes(i)    = BlockSizesA(i+Off)
          ENDDO
          Pattern%RecvDesc(ipe)%Nparcels  = &
            size (Pattern%RecvDesc(ipe)%Displacements)
          Pattern%RecvDesc(ipe)%Tot_Size = &
            sum  (Pattern%RecvDesc(ipe)%Blocksizes)
          Max_Nparcels = max (Max_Nparcels, Pattern%RecvDesc(ipe)%Nparcels)

        Off = Off + Num
      ENDDO

    endif !  (iam .lt. NpesB)

!
! Now communicate to the sender what the receiver is expecting
!
      CALL ParExchangeVectorInt( InComm, Count, LocalA,                 &
                                 CountOut, DisplacementsB  )
      CALL ParExchangeVectorInt( InComm, Count, BlockSizesA,            &
                                 CountOut, BlockSizesB )

!
! Sender A: BlockSizes and Displacements can now be stored
!

    if (iam .lt. NpesA) then

      Off = 0
      DO ipe=1, NpesB
        Num = CountOut(ipe)
        if(Num>0) then 
#if defined(DEBUG_PARPATTERNDECOMPTODECOMP)
        write(iulog,*) "Sender Iam", Iam, "Ipe", Ipe-1, "Num", Num,           &
                 "Displacements", DisplacementsB(Off+1:Off+Num),        &
                 "BlockSizes", BlockSizesB(Off+1:Off+Num)
#endif
        endif
          IF ( Num > 0 .and. method > 0 ) THEN
            CALL MPI_TYPE_INDEXED( Num, BlockSizesB(Off+1),             &
                                   DisplacementsB(Off+1),               &
                                   DataType, Ptr, Ierror )
            CALL MPI_TYPE_COMMIT( Ptr, Ierror )
            Pattern%SendDesc(ipe)%type = Ptr
          ELSE
            Pattern%SendDesc(ipe)%type = MPI_DATATYPE_NULL
          ENDIF

          ALLOCATE( Pattern%SendDesc(ipe)%Displacements(Num) )
          ALLOCATE( Pattern%SendDesc(ipe)%BlockSizes(Num) )
          DO i=1, Num
            Pattern%SendDesc(ipe)%Displacements(i) = DisplacementsB(i+Off)
            Pattern%SendDesc(ipe)%BlockSizes(i)    = BlockSizesB(i+Off)
          ENDDO
          Pattern%SendDesc(ipe)%Nparcels  =  &
            size (Pattern%SendDesc(ipe)%Displacements)
          Pattern%SendDesc(ipe)%Tot_Size = &
            sum  (Pattern%SendDesc(ipe)%Blocksizes)
          Max_Nparcels = max (Max_Nparcels, Pattern%SendDesc(ipe)%Nparcels)

        Off = Off + Num
      ENDDO

    endif !  (iam .lt. NpesA)

      CALL get_partneroffset( InComm, Pattern%SendDesc, Pattern%RecvDesc )
      
      DEALLOCATE( CountOut )
      DEALLOCATE( Count )

      DEALLOCATE( PeB )
      DEALLOCATE( LocalB )
      DEALLOCATE( BlockSizesB )
      DEALLOCATE( DisplacementsB )

      DEALLOCATE( LocalA )
      DEALLOCATE( BlockSizesA )
      DEALLOCATE( DisplacementsA )

      CPP_LEAVE_PROCEDURE( "PARPATTERNDECOMPTODECOMP" )
      RETURN
!EOC
      END SUBROUTINE ParPatternDecompToDecomp
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE:   ParPatternDecompToGhost --- Create pattern decomp to ghost
!
! !INTERFACE:

      SUBROUTINE ParPatternDecompToGhost( InComm, DA, GB, Pattern, mod_method, T )
!
! !USES:
      USE decompmodule, ONLY : DecompType, DecompGlobalToLocal,         &
                               DecompInfo
      USE ghostmodule, ONLY : GhostType, GhostInfo
      USE mod_comm, ONLY : get_partneroffset
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER,  INTENT( IN )               :: InComm     ! Communicator
      TYPE(DecompType),  INTENT( IN )      :: DA         ! Source Decomp Desc
      TYPE(GhostType),  INTENT( IN )       :: GB         ! Target Ghost Desc
      INTEGER,  INTENT( IN ), OPTIONAL     :: mod_method ! contiguous or derived type
      INTEGER, INTENT( IN ),  OPTIONAL     :: T          ! MPI data type (default CPP_MPI_REAL8)

! !OUTPUT PARAMETERS:
      TYPE(ParPatternType), INTENT( OUT )  :: Pattern ! Comm Pattern
!
! !DESCRIPTION:
!     This routine constructs a communication pattern for a transformation
!     from a decomposition to a ghosted decomposition, i.e., a so-called 
!     "transpose".  The resulting communication pattern can be used in 
!     ParBegin/EndTransfer with the decomposed arrays as inputs.  
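!
!     A hedged sketch (names are assumptions): build a pattern that
!     scatters a non-ghosted decomposition into a ghosted target
!     through the generic ParPatternCreate interface:
!
!        CALL ParPatternCreate( comm, decompA, ghostB, pattern )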
!
! !SYSTEM ROUTINES:
!    MPI_COMM_SIZE,  MPI_COMM_RANK, MPI_COMM_DUP
!    MPI_TYPE_INDEXED, MPI_TYPE_COMMIT (depending on method)
!
! !REVISION HISTORY:
!   01.07.12   Sawyer     Creation from ParPatternDecompToDecomp
!   02.03.20   Sawyer     Bug fix: added OldLocal, increment Off
!   02.07.16   Sawyer     Added data type T
!   03.11.11   Mirin      Added optional argument mod_method
!   07.03.11   Mirin      Generalized to different sized decompositions
!   07.09.04   Dennis     Reduced amount of temporary memory usage
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER I, J, Tag, Local, Pe, LenB, JB, Ipe, Num, Inc, Off
      INTEGER Ptr                                ! Pointer type
      INTEGER GroupSize, Iam, Ierror
      INTEGER OldPe, OldLocal, TotalPtsA, NpesA
      INTEGER GlobalSizeB, LocalSizeB, BorderSizeB, NpesB
      INTEGER DataType
      INTEGER              :: method
      INTEGER              :: nCount, maxCount, ierr
#ifdef _SMEMORY
      TYPE (ParInfoType) :: Info
#endif

      INTEGER, ALLOCATABLE :: Count(:)           ! # segments for each recv PE
      INTEGER, ALLOCATABLE :: CountOut(:)        ! # segments for each send PE

      INTEGER, ALLOCATABLE :: DisplacementsA(:)  ! Generic displacements
      INTEGER, ALLOCATABLE :: BlockSizesA(:)     ! Generic block sizes
      INTEGER, ALLOCATABLE :: LocalA(:)          ! Generic Local indices

      INTEGER, ALLOCATABLE :: DisplacementsB(:)  ! Displacements for B
      INTEGER, ALLOCATABLE :: BlockSizesB(:)     ! Block sizes for B
      INTEGER, ALLOCATABLE :: LocalB(:)          ! Local indices for B
      INTEGER, ALLOCATABLE :: PeB(:)             ! Processor element numbers

      CPP_ENTER_PROCEDURE( "PARPATTERNDECOMPTOGHOST" )

      IF (present(T)) THEN
        DataType = T
      ELSE
        DataType = CPP_MPI_REAL8
      ENDIF

      IF (present(mod_method)) THEN
        method = mod_method
      ELSE
        method = 0     ! Default method - see mod_comm for description
      ENDIF

! Assume this routine is called by processes [ 0,max(NpesA,NpesB) )

      CALL DecompInfo( DA, NpesA, TotalPtsA )
      CALL GhostInfo( GB, NpesB, GlobalSizeB, LocalSizeB, BorderSizeB )

      CALL MPI_COMM_SIZE( InComm, GroupSize, Ierror )
      CALL MPI_COMM_RANK( InComm, Iam, Ierror )
      CALL MPI_COMM_DUP( InComm, Pattern%Comm, Ierror )

#ifdef _SMEMORY
! Calculate info about the pattern 
      call ParCalcInfo(InComm,DA,GB, Info)
      TotalPtsA=Info%maxNumSeg
      GlobalSizeB=Info%maxNumSeg
#endif

      Pattern%Size = GroupSize
      Pattern%Iam  = Iam
!
! Allocate the number of entries and list head arrays
!

!
! Allocate the patterns
!
      ALLOCATE( Pattern%SendDesc( NpesB ) )
      Pattern%SendDesc(:)%method = method
      if (iam .ge. NpesA) then
         do ipe = 1, NpesB
            ALLOCATE( Pattern%SendDesc(ipe)%Displacements(1) )
            ALLOCATE( Pattern%SendDesc(ipe)%BlockSizes(1) )
            Pattern%SendDesc(ipe)%Tot_Size = -1
            Pattern%SendDesc(ipe)%Nparcels = -1
            Pattern%SendDesc(ipe)%type = MPI_DATATYPE_NULL
            Pattern%SendDesc(ipe)%Displacements(1) = -1
            Pattern%SendDesc(ipe)%Blocksizes(1) = -1
         enddo
      endif

      ALLOCATE( Pattern%RecvDesc( NpesA ) )
      Pattern%RecvDesc(:)%method = method
      if (iam .ge. NpesB) then
         do ipe = 1, NpesA
            ALLOCATE( Pattern%RecvDesc(ipe)%Displacements(1) )
            ALLOCATE( Pattern%RecvDesc(ipe)%BlockSizes(1) )
            Pattern%RecvDesc(ipe)%Tot_Size = -1
            Pattern%RecvDesc(ipe)%Nparcels = -1
            Pattern%RecvDesc(ipe)%type = MPI_DATATYPE_NULL
            Pattern%RecvDesc(ipe)%Displacements(1) = -1
            Pattern%RecvDesc(ipe)%Blocksizes(1) = -1
         enddo
      endif

!
! Local allocations
!
      ALLOCATE( DisplacementsA( TotalPtsA ) )   ! Allocate for worst case
      ALLOCATE( BlockSizesA( TotalPtsA ) )      ! Allocate for worst case
      ALLOCATE( LocalA( TotalPtsA ) )           ! Allocate for worst case

      ALLOCATE( DisplacementsB( GlobalSizeB ) ) ! Allocate for worst case
      ALLOCATE( BlockSizesB( GlobalSizeB ) )    ! Allocate for worst case
      ALLOCATE( LocalB( GlobalSizeB ) )         ! Allocate for worst case
      ALLOCATE( PeB( GlobalSizeB ) )            ! Allocate for worst case

      ALLOCATE( Count( GroupSize ) )
      ALLOCATE( CountOut( GroupSize ) )

      JB        = 0
      Count     = 0
      LenB      = 0
      LocalA      = 0   !  (needed for parexchangevector later)
      BlocksizesA = 0   !  (needed for parexchangevector later)

      Num    = 0
      Inc    = 0

    if (iam .lt. NpesB) then

!
! Parse through all the tags in the local segment
      DO J = 1, SIZE( GB%Local%Head(iam+1)%StartTags )
        OldPe     = -1         ! Set PE undefined
        OldLocal  =  0         ! Set local index undefined
        DO Tag=GB%Local%Head(iam+1)%StartTags(J),                         &
                GB%Local%Head(iam+1)%EndTags(J)
          IF ( Tag > 0 ) THEN        ! Active point
!
! Determine the index and PE of this entry on A. This might be inlined later
!
            CALL DecompGlobalToLocal( DA, Tag, Local, Pe )

!
! If ipe-1 is my id, then this is an entry ipe will receive from Pe
!
            IF ( Pe /= OldPe .OR. Local /= OldLocal+1 ) THEN
              IF ( jb > 0 ) THEN
                BlockSizesB(jb) = LenB
                LenB = 0
              ENDIF
              jb = jb+1                     ! increment the segment index
              DisplacementsB(jb) = Inc      ! Zero-based offset of local segment
              LocalB(jb) = Local-1          ! Local indices (zero-based)
              PeB(jb) = Pe                  ! Note the ID of the sender
              Count(Pe+1) = Count(Pe+1)+1 ! Increment counter of segments
            ENDIF
            OldPe   = Pe                    ! Update PE
            OldLocal= Local                 ! Update local index
            LenB = LenB+1                   ! Good -- segment is getting longer
          ENDIF
          Inc = Inc+1                     ! Increment local index
        ENDDO
      ENDDO
!
! Clean up
!
      IF ( jb>0 ) BlockSizesB(jb) = LenB

      CPP_ASSERT_F90( JB .LE. GlobalSizeB )
!
! Now create the pattern from the displacements and block sizes
!
      Inc = 0
      DO ipe = 1, NpesA
!
! Find the segments which are relevant for the sender ipe
! Make compact arrays BlockSizes and Displacements 
!
        DO j = 1, jb
          IF ( PeB(j) == ipe-1 ) THEN
            Inc = Inc + 1
            BlockSizesA(Inc) = BlockSizesB(j)
            DisplacementsA(Inc) = DisplacementsB(j)
            LocalA(Inc)      = LocalB(j)
          ENDIF
        ENDDO
      ENDDO

      CPP_ASSERT_F90( Inc .LE. TotalPtsA )

      Off = 0
      DO ipe = 1, NpesA
        Num = Count(ipe)
#if defined( DEBUG_PARPATTERNDECOMPTOGHOST )
        write(iulog,*) "Receiver Iam", Iam, "Ipe", Ipe-1, "Num", Num, &
                 "Displacements", DisplacementsA(Off+1:Off+Num), &
                 "BlockSizes", BlockSizesA(Off+1:Off+Num)
#endif

!
! Create the receiver communication pattern
!
          IF ( Num > 0 .and. method > 0 ) THEN
            CALL MPI_TYPE_INDEXED( Num, BlockSizesA(Off+1),        &
                   DisplacementsA(Off+1), DataType, Ptr, Ierror )
            CALL MPI_TYPE_COMMIT( Ptr, Ierror )
            Pattern%RecvDesc(ipe)%type = Ptr
          ELSE
            Pattern%RecvDesc(ipe)%type = MPI_DATATYPE_NULL
          ENDIF

          ALLOCATE( Pattern%RecvDesc(ipe)%Displacements(Num) )
          ALLOCATE( Pattern%RecvDesc(ipe)%BlockSizes(Num) )
          DO i=1, Num
            Pattern%RecvDesc(ipe)%Displacements(i) = DisplacementsA(i+Off)
            Pattern%RecvDesc(ipe)%BlockSizes(i)    = BlockSizesA(i+Off)
          ENDDO
          Pattern%RecvDesc(ipe)%Nparcels  = &
            size (Pattern%RecvDesc(ipe)%Displacements)
          Pattern%RecvDesc(ipe)%Tot_Size = &
            sum  (Pattern%RecvDesc(ipe)%Blocksizes)
          Max_Nparcels = max (Max_Nparcels, Pattern%RecvDesc(ipe)%Nparcels)

        Off = Off + Num
      ENDDO

    endif !  (iam .lt. NpesB)

!
! Now communicate to the sender what the receiver is expecting
!
      CALL ParExchangeVectorInt( InComm, Count, LocalA,                 &
                                 CountOut, DisplacementsB  )
      CALL ParExchangeVectorInt( InComm, Count, BlockSizesA,            &
                                 CountOut, BlockSizesB )

!
! Sender A: BlockSizes and Displacements can now be stored
!

    if (iam .lt. NpesA) then

      Off = 0
      DO ipe=1, NpesB
        Num = CountOut(ipe)
#if defined( DEBUG_PARPATTERNDECOMPTOGHOST )
        write(iulog,*) "Sender Iam", Iam, "Ipe", Ipe-1, "Num", Num,           &
                 "Displacements", DisplacementsB(Off+1:Off+Num),        &
                 "BlockSizes", BlockSizesB(Off+1:Off+Num)
#endif

          IF ( Num > 0 .and. method > 0 ) THEN
            CALL MPI_TYPE_INDEXED( Num, BlockSizesB(Off+1),          &
                    DisplacementsB(Off+1), DataType, Ptr, Ierror )
            CALL MPI_TYPE_COMMIT( Ptr, Ierror )
            Pattern%SendDesc(ipe)%type = Ptr
          ELSE
            Pattern%SendDesc(ipe)%type = MPI_DATATYPE_NULL
          ENDIF

          ALLOCATE( Pattern%SendDesc(ipe)%Displacements(Num) )
          ALLOCATE( Pattern%SendDesc(ipe)%BlockSizes(Num) )
          DO i=1, Num
            Pattern%SendDesc(ipe)%Displacements(i) = DisplacementsB(i+Off)
            Pattern%SendDesc(ipe)%BlockSizes(i)    = BlockSizesB(i+Off)
          ENDDO
          Pattern%SendDesc(ipe)%Nparcels  = &
            size (Pattern%SendDesc(ipe)%Displacements)
          Pattern%SendDesc(ipe)%Tot_Size = &
            sum  (Pattern%SendDesc(ipe)%Blocksizes)
          Max_Nparcels = max (Max_Nparcels, Pattern%SendDesc(ipe)%Nparcels)

        Off = Off + Num
      ENDDO

    endif !  (iam .lt. NpesA)

      CALL get_partneroffset( InComm, Pattern%SendDesc, Pattern%RecvDesc )
      
      DEALLOCATE( CountOut )
      DEALLOCATE( Count )

      DEALLOCATE( PeB )
      DEALLOCATE( LocalB )
      DEALLOCATE( BlockSizesB )
      DEALLOCATE( DisplacementsB )

      DEALLOCATE( LocalA )
      DEALLOCATE( BlockSizesA )
      DEALLOCATE( DisplacementsA )

      CPP_LEAVE_PROCEDURE( "PARPATTERNDECOMPTOGHOST" )
      RETURN
!EOC
      END SUBROUTINE ParPatternDecompToGhost
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE:   ParPatternGhostToDecomp --- Create pattern between decomps
!
! !INTERFACE:

      SUBROUTINE ParPatternGhostToDecomp( InComm, GA, DB, Pattern, mod_method, T )
!
! !USES:
      USE decompmodule, ONLY : DecompType, DecompGlobalToLocal, DecompInfo
      USE ghostmodule, ONLY : GhostType, GhostInfo
      USE mod_comm, ONLY : get_partneroffset
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER,  INTENT( IN )               :: InComm     ! Communicator
      TYPE(GhostType),   INTENT( IN )      :: GA         ! Source Ghost Desc
      TYPE(DecompType),  INTENT( IN )      :: DB         ! Target Decomp Desc
      INTEGER,  INTENT( IN ), OPTIONAL     :: mod_method ! contiguous or derived type
      INTEGER, INTENT( IN ),  OPTIONAL     :: T          ! MPI data type (default CPP_MPI_REAL8)
! !OUTPUT PARAMETERS:
      TYPE(ParPatternType), INTENT( OUT )  :: Pattern ! Comm Pattern
!
! !DESCRIPTION:
!     This routine constructs a communication pattern for a
!     transformation from a ghosted decomposition to a partitioned
!     one, i.e., a so-called "transpose". The resulting communication 
!     pattern can be used in ParBegin/EndTransfer with the decomposed 
!     arrays as inputs.  
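!
!     A hedged sketch (names are assumptions): gather the interior of
!     a ghosted source back onto a plain decomposition through the
!     generic ParPatternCreate interface:
!
!        CALL ParPatternCreate( comm, ghostA, decompB, pattern )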
!
! !SYSTEM ROUTINES:
!    MPI_COMM_SIZE,  MPI_COMM_RANK, MPI_COMM_DUP
!    MPI_TYPE_INDEXED, MPI_TYPE_COMMIT (depending on method)
!
! !REVISION HISTORY:
!   02.01.10   Sawyer     Creation from DecompToDecomp
!   02.07.16   Sawyer     Added data type T
!   03.11.11   Mirin      Added optional argument mod_method
!   07.03.11   Mirin      Generalized to different sized decompositions
!   07.09.04   Dennis     Reduced amount of temporary memory usage
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER I, J, Tag, Local, Pe, Len, JA, Ipe, Num, Inc, Off
      INTEGER NpesA, GlobalSizeA, LocalSizeA, BorderSizeA
      INTEGER OldPe, OldLocal, TotalPtsB, NpesB
      INTEGER GroupSize, Iam, Ierror
      INTEGER Ptr                                ! Pointer type
      INTEGER  DataType
      INTEGER              :: method
      INTEGER              :: nCount, maxCount, ierr
#ifdef _SMEMORY
      TYPE (ParInfoType) :: Info
#endif

      INTEGER, ALLOCATABLE :: Count(:)           ! # segments for each recv PE
      INTEGER, ALLOCATABLE :: CountOut(:)        ! # segments for each send PE

      INTEGER, ALLOCATABLE :: DisplacementsA(:)  ! Generic displacements
      INTEGER, ALLOCATABLE :: BlockSizesA(:)     ! Generic block sizes
      INTEGER, ALLOCATABLE :: GlobalA(:)          ! Generic global indices
      INTEGER, ALLOCATABLE :: PeA(:)             ! Processor element numbers

      INTEGER, ALLOCATABLE :: DisplacementsB(:)  ! Displacements for B
      INTEGER, ALLOCATABLE :: BlockSizesB(:)     ! Block sizes for B
      INTEGER, ALLOCATABLE :: GlobalB(:)         ! Global indices for B

      CPP_ENTER_PROCEDURE( "PARPATTERNGHOSTTODECOMP" )

      IF (present(T)) THEN
        DataType = T
      ELSE
        DataType = CPP_MPI_REAL8
      ENDIF

      IF (present(mod_method)) THEN
        method = mod_method
      ELSE
        method = 0     ! Default method - see mod_comm for description
      ENDIF

! Assume this routine is called by processes [ 0,max(NpesA,NpesB) )

      CALL GhostInfo( GA, NpesA, GlobalSizeA, LocalSizeA, BorderSizeA )
      CALL DecompInfo( DB, NpesB, TotalPtsB )

      CALL MPI_COMM_SIZE( InComm, GroupSize, Ierror )
      CALL MPI_COMM_RANK( InComm, Iam, Ierror )
      CALL MPI_COMM_DUP( InComm, Pattern%Comm, Ierror )

#ifdef _SMEMORY
! Calculate info about the pattern 
      call ParCalcInfo(InComm,GA,DB, Info)
      GlobalSizeA=Info%maxNumSeg
      TotalPtsB=Info%maxNumSeg
#endif

      Pattern%Size = GroupSize
      Pattern%Iam  = Iam
!
! Allocate the number of entries and list head arrays
!

!
! Allocate the patterns
!
      ALLOCATE( Pattern%SendDesc( NpesB ) )
      Pattern%SendDesc(:)%method = method
      if (iam .ge. NpesA) then
         do ipe = 1, NpesB
            ALLOCATE( Pattern%SendDesc(ipe)%Displacements(1) )
            ALLOCATE( Pattern%SendDesc(ipe)%BlockSizes(1) )
            Pattern%SendDesc(ipe)%Tot_Size = -1
            Pattern%SendDesc(ipe)%Nparcels = -1
            Pattern%SendDesc(ipe)%type = MPI_DATATYPE_NULL
            Pattern%SendDesc(ipe)%Displacements(1) = -1
            Pattern%SendDesc(ipe)%Blocksizes(1) = -1
         enddo
      endif

      ALLOCATE( Pattern%RecvDesc( NpesA ) )
      Pattern%RecvDesc(:)%method = method
      if (iam .ge. NpesB) then
         do ipe = 1, NpesA
            ALLOCATE( Pattern%RecvDesc(ipe)%Displacements(1) )
            ALLOCATE( Pattern%RecvDesc(ipe)%BlockSizes(1) )
            Pattern%RecvDesc(ipe)%Tot_Size = -1
            Pattern%RecvDesc(ipe)%Nparcels = -1
            Pattern%RecvDesc(ipe)%type = MPI_DATATYPE_NULL
            Pattern%RecvDesc(ipe)%Displacements(1) = -1
            Pattern%RecvDesc(ipe)%Blocksizes(1) = -1
         enddo
      endif

!
! Local allocations
!
      ALLOCATE( DisplacementsA( GlobalSizeA ) )   ! Allocate for worst case
      ALLOCATE( BlockSizesA( GlobalSizeA ) )      ! Allocate for worst case
      ALLOCATE( GlobalA( GlobalSizeA ) )          ! Allocate for worst case
      ALLOCATE( PeA( GlobalSizeA ) )              ! Allocate for worst case

      ALLOCATE( DisplacementsB( TotalPtsB ) )     ! Allocate for worst case
      ALLOCATE( BlockSizesB( TotalPtsB ) )        ! Allocate for worst case
      ALLOCATE( GlobalB( TotalPtsB ) )            ! Allocate for worst case

      ALLOCATE( Count( GroupSize ) )
      ALLOCATE( CountOut( GroupSize ) )

      JA     = 0
      Count  = 0
      Len    = 0
      GlobalB     = 0   !  (needed for parexchangevector later)
      BlockSizesB = 0   !  (needed for parexchangevector later)

      Num    = 0
      Inc    = 0

    if (iam .lt. NpesB) then

!
! Parse through all the tags in the local segment
      DO J = 1, SIZE( DB%Head(iam+1)%StartTags )
        OldPe     = -1         ! Set PE undefined
        OldLocal  = 0          ! Set index value undefined
        DO Tag=DB%Head(iam+1)%StartTags(J), DB%Head(iam+1)%EndTags(J)
!
! Determine the index and PE of this entry on A. This might be inlined later
!
          CALL DecompGlobalToLocal( GA%Decomp, Tag, Local, Pe )

!
! This is an entry which the local PE (iam) will receive from Pe
!
          IF ( Pe /= OldPe  .OR. Local /= OldLocal+1 ) THEN
            IF ( ja > 0 ) THEN
              BlockSizesA(ja) = Len
              Len = 0
            ENDIF
            ja = ja+1                     ! increment the segment index
            DisplacementsA(ja) = Inc      ! Zero-based offset of local segment
            GlobalA(ja) = Tag             ! The global tag of the desired datum
            PeA(ja) = Pe                  ! Note the ID of the sender
            Count(Pe+1) = Count(Pe+1)+1   ! Increment counter of segments
          ENDIF
          OldPe    = Pe                   ! Update old PE
          OldLocal = Local                ! Update old local index
          Len = Len+1                     ! Good -- segment is getting longer
          Inc = Inc+1                     ! Increment local index
        ENDDO
      ENDDO
!
! Clean up
!
      BlockSizesA(ja) = Len
      CPP_ASSERT_F90( JA .LE. GlobalSizeA )
!
! Now create the pattern from the displacements and block sizes
!
      Inc = 0
      DO ipe = 1, NpesA
!
! Find the segments which are relevant for the sender ipe
! Make compact arrays BlockSizes and Displacements 
!
        DO j = 1, ja
          IF ( PeA(j) == ipe-1 ) THEN
            Inc = Inc + 1
            BlockSizesB(Inc) = BlockSizesA(j)
            DisplacementsB(Inc) = DisplacementsA(j)
            GlobalB(Inc)      = GlobalA(j)
          ENDIF
        ENDDO
      ENDDO

     CPP_ASSERT_F90(Inc .LE. TotalPtsB)

!
! Create the receiver communication pattern
!
      Off = 0
      DO ipe = 1, NpesA
        Num = Count(ipe)
#if defined( DEBUG_PARPATTERNGHOSTTODECOMP )
        write(iulog,*) "Receiver Iam", Iam, "Ipe", Ipe-1, "Num", Num, &
                 "Displacements", DisplacementsB(Off+1:Off+Num), &
                 "BlockSizes", BlockSizesB(Off+1:Off+Num)
#endif

          IF ( Num > 0 .and. method > 0 ) THEN
            CALL MPI_TYPE_INDEXED( Num, BlockSizesB(Off+1),       &
               DisplacementsB(Off+1), DataType, Ptr, Ierror )
            CALL MPI_TYPE_COMMIT( Ptr, Ierror )
            Pattern%RecvDesc(ipe)%type = Ptr
          ELSE
            Pattern%RecvDesc(ipe)%type = MPI_DATATYPE_NULL
          ENDIF

          ALLOCATE( Pattern%RecvDesc(ipe)%Displacements(Num) )
          ALLOCATE( Pattern%RecvDesc(ipe)%BlockSizes(Num) )
          DO i=1, Num
            Pattern%RecvDesc(ipe)%Displacements(i) = DisplacementsB(i+Off)
            Pattern%RecvDesc(ipe)%BlockSizes(i)    = BlockSizesB(i+Off)
          ENDDO
          Pattern%RecvDesc(ipe)%Nparcels  = &
            size (Pattern%RecvDesc(ipe)%Displacements)
          Pattern%RecvDesc(ipe)%Tot_Size = &
            sum  (Pattern%RecvDesc(ipe)%Blocksizes)
          Max_Nparcels = max (Max_Nparcels, Pattern%RecvDesc(ipe)%Nparcels)

        Off = Off + Num
      ENDDO

    endif !  (iam .lt. NpesB)

!
! Now communicate what the receiver is expecting to the sender
!
      CALL ParExchangeVectorInt( InComm, Count, GlobalB,                 &
                                 CountOut, GlobalA  )
      CALL ParExchangeVectorInt( InComm, Count, BlockSizesB,            &
                                 CountOut, BlockSizesA )

    if (iam .lt. NpesA) then

!
! Sender A: BlockSizes and Displacements can now be stored
!
      Off = 0
      DO ipe=1, NpesB
        Num = CountOut(ipe)
        DO i=1, Num
          CALL DecompGlobalToLocal( GA%Local, GlobalA(i+Off), Local, Pe )
          DisplacementsA(i+Off) = Local-1    ! zero-based displacement
        ENDDO
#if defined( DEBUG_PARPATTERNGHOSTTODECOMP )
        write(iulog,*) "Sender Iam", Iam, "Ipe", Ipe-1, "Num", Num,  &
                 "Displacements", DisplacementsA(Off+1:Off+Num), &
                 "BlockSizes", BlockSizesA(Off+1:Off+Num)
#endif

          IF ( Num > 0 .and. method > 0 ) THEN
            CALL MPI_TYPE_INDEXED( Num, BlockSizesA(Off+1),        &
                    DisplacementsA(Off+1), DataType, Ptr, Ierror )
            CALL MPI_TYPE_COMMIT( Ptr, Ierror )
            Pattern%SendDesc(ipe)%type = Ptr
          ELSE
            Pattern%SendDesc(ipe)%type = MPI_DATATYPE_NULL
          ENDIF

          ALLOCATE( Pattern%SendDesc(ipe)%Displacements(Num) )
          ALLOCATE( Pattern%SendDesc(ipe)%BlockSizes(Num) )
          DO i=1, Num
            Pattern%SendDesc(ipe)%Displacements(i) = DisplacementsA(i+Off)
            Pattern%SendDesc(ipe)%BlockSizes(i)    = BlockSizesA(i+Off)
          ENDDO
          Pattern%SendDesc(ipe)%Nparcels  = &
            size (Pattern%SendDesc(ipe)%Displacements)
          Pattern%SendDesc(ipe)%Tot_Size = &
            sum  (Pattern%SendDesc(ipe)%Blocksizes)
          Max_Nparcels = max (Max_Nparcels, Pattern%SendDesc(ipe)%Nparcels)

        Off = Off + Num
      ENDDO

    endif !  (iam .lt. NpesA)

      CALL get_partneroffset( InComm, Pattern%SendDesc, Pattern%RecvDesc )
      
      DEALLOCATE( CountOut )
      DEALLOCATE( Count )

      DEALLOCATE( PeA )
      DEALLOCATE( GlobalA )
      DEALLOCATE( BlockSizesA )
      DEALLOCATE( DisplacementsA )

      DEALLOCATE( GlobalB )
      DEALLOCATE( BlockSizesB )
      DEALLOCATE( DisplacementsB )

      CPP_LEAVE_PROCEDURE( "PARPATTERNGHOSTTODECOMP" )
      RETURN
!EOC
      END SUBROUTINE ParPatternGhostToDecomp
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !IROUTINE:   ParPatternGhostToGhost --- Create pattern between ghosted decomps
!
! !INTERFACE:

      SUBROUTINE ParPatternGhostToGhost( InComm, GA, GB, Pattern, mod_method, T )
!
! !USES:
      USE decompmodule, ONLY : DecompGlobalToLocal
      USE ghostmodule, ONLY : GhostType, GhostInfo
      USE mod_comm, ONLY : get_partneroffset
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER,  INTENT( IN )               :: InComm  ! Communicator
      TYPE(GhostType),   INTENT( IN )      :: GA      ! Source Ghost Decomp
      TYPE(GhostType),   INTENT( IN )      :: GB      ! Target Ghost Decomp
      INTEGER,  INTENT( IN ), OPTIONAL     :: mod_method ! contiguous or derived type
      INTEGER, INTENT( IN ),  OPTIONAL     :: T       ! MPI data type (default CPP_MPI_REAL8)
! !OUTPUT PARAMETERS:
      TYPE(ParPatternType), INTENT( OUT )  :: Pattern ! Comm Pattern
!
! !DESCRIPTION:
!     This routine contructs a communication pattern for a 
!     transformation from one ghosted decomposition to partitioned
!     one, i.e., a so-called "transpose". The resulting communication 
!     pattern can be used in ParBegin/EndTransfer with the decomposed 
!     arrays as inputs.  
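!
!     Usage mirrors ParPatternGhostToDecomp above, except that both the
!     source and target arrays are ghosted (names hypothetical):
!
!        CALL ParPatternCreate( CommGlobal, GhostA, GhostB, Pattern )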
!
! !SYSTEM ROUTINES:
!    MPI_COMM_SIZE,  MPI_COMM_RANK, MPI_COMM_DUP
!    MPI_TYPE_INDEXED, MPI_TYPE_COMMIT (depending on method)
!
! !REVISION HISTORY:
!   02.01.10   Sawyer     Creation from DecompToDecomp
!   02.07.16   Sawyer     Added data type T
!   03.11.11   Mirin      Added optional argument mod_method
!   07.03.11   Mirin      Generalized to different sized decompositions
!   07.09.04   Dennis     Reduced amount of temporary memory usage
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER I, J, Tag, Local, Pe, Len, JA, Ipe, Num, Inc, Off
      INTEGER NpesA, GlobalSizeA, LocalSizeA, BorderSizeA
      INTEGER NpesB, GlobalSizeB, LocalSizeB, BorderSizeB
      INTEGER GroupSize, Iam, Ierror, OldPe, OldLocal 
      INTEGER Ptr                                ! Pointer type
      INTEGER  DataType
      INTEGER              :: method
      INTEGER              :: nCount, maxCount, ierr
#ifdef _SMEMORY
      TYPE (ParInfoType) :: Info
#endif

      INTEGER, ALLOCATABLE :: Count(:)           ! # segments for each recv PE
      INTEGER, ALLOCATABLE :: CountOut(:)        ! # segments for each send PE

      INTEGER, ALLOCATABLE :: DisplacementsA(:)  ! Generic displacements
      INTEGER, ALLOCATABLE :: BlockSizesA(:)     ! Generic block sizes
      INTEGER, ALLOCATABLE :: GlobalA(:)         ! Generic global indices
      INTEGER, ALLOCATABLE :: PeA(:)             ! Processor element numbers

      INTEGER, ALLOCATABLE :: DisplacementsB(:)  ! Displacements for B
      INTEGER, ALLOCATABLE :: BlockSizesB(:)     ! Block sizes for B
      INTEGER, ALLOCATABLE :: GlobalB(:)         ! Global indices for B

      CPP_ENTER_PROCEDURE( "PARPATTERNGHOSTTOGHOST" )

      IF (present(T)) THEN
        DataType = T
      ELSE
        DataType = CPP_MPI_REAL8
      ENDIF

      IF (present(mod_method)) THEN
        method = mod_method
      ELSE
        method = 0     ! Default method - see mod_comm for description
      ENDIF

! Assume this routine is called by processes [ 0,max(NpesA,NpesB) )

      CALL GhostInfo( GA, NpesA, GlobalSizeA, LocalSizeA, BorderSizeA )
      CALL GhostInfo( GB, NpesB, GlobalSizeB, LocalSizeB, BorderSizeB )

      CALL MPI_COMM_SIZE( InComm, GroupSize, Ierror )
      CALL MPI_COMM_RANK( InComm, Iam, Ierror )
      CALL MPI_COMM_DUP( InComm, Pattern%Comm, Ierror )

#ifdef _SMEMORY
! Calculate info about the pattern 
      call ParCalcInfo(InComm,GA,GB, Info)
      GlobalSizeA=Info%maxNumSeg
      GlobalSizeB=Info%maxNumSeg
#endif

      Pattern%Size = GroupSize
      Pattern%Iam  = Iam
!
! Allocate the number of entries and list head arrays
!

!
! Allocate the patterns
!
      ALLOCATE( Pattern%SendDesc( NpesB ) )
      Pattern%SendDesc(:)%method = method
      if (iam .ge. NpesA) then
         do ipe = 1, NpesB
            ALLOCATE( Pattern%SendDesc(ipe)%Displacements(1) )
            ALLOCATE( Pattern%SendDesc(ipe)%BlockSizes(1) )
            Pattern%SendDesc(ipe)%Tot_Size = -1
            Pattern%SendDesc(ipe)%Nparcels = -1
            Pattern%SendDesc(ipe)%type = MPI_DATATYPE_NULL
            Pattern%SendDesc(ipe)%Displacements(1) = -1
            Pattern%SendDesc(ipe)%Blocksizes(1) = -1
         enddo
      endif

      ALLOCATE( Pattern%RecvDesc( NpesA ) )
      Pattern%RecvDesc(:)%method = method
      if (iam .ge. NpesB) then
         do ipe = 1, NpesA
            ALLOCATE( Pattern%RecvDesc(ipe)%Displacements(1) )
            ALLOCATE( Pattern%RecvDesc(ipe)%BlockSizes(1) )
            Pattern%RecvDesc(ipe)%Tot_Size = -1
            Pattern%RecvDesc(ipe)%Nparcels = -1
            Pattern%RecvDesc(ipe)%type = MPI_DATATYPE_NULL
            Pattern%RecvDesc(ipe)%Displacements(1) = -1
            Pattern%RecvDesc(ipe)%Blocksizes(1) = -1
         enddo
      endif

!
! Local allocations
!
      ALLOCATE( DisplacementsA( GlobalSizeA ) )   ! Allocate for worst case
      ALLOCATE( BlockSizesA( GlobalSizeA ) )      ! Allocate for worst case
      ALLOCATE( GlobalA( GlobalSizeA ) )          ! Allocate for worst case
      ALLOCATE( PeA( GlobalSizeA ) )              ! Allocate for worst case

      ALLOCATE( DisplacementsB( GlobalSizeB ) )   ! Allocate for worst case
      ALLOCATE( BlockSizesB( GlobalSizeB ) )      ! Allocate for worst case
      ALLOCATE( GlobalB( GlobalSizeB ) )          ! Allocate for worst case

      ALLOCATE( Count( GroupSize ) )
      ALLOCATE( CountOut( GroupSize ) )

      JA          = 0
      Count       = 0
      Len         = 0
      GlobalB     = 0   !  (needed for parexchangevector later)
      BlockSizesB = 0   !  (needed for parexchangevector later)

      Num    = 0
      Inc    = 0

    if (iam .lt. NpesB) then

!
! Parse through all the tags in the local segment
      DO J = 1, SIZE( GB%Local%Head(iam+1)%StartTags )
        OldPe     = -1         ! Set PE undefined
        OldLocal  = 0          ! Set index value undefined
        DO Tag=GB%Local%Head(iam+1)%StartTags(J), GB%Local%Head(iam+1)%EndTags(J)
          IF ( Tag > 0 ) THEN       ! Active point
!
! Determine the index and PE of this entry on A. This might be inlined later
!
            CALL DecompGlobalToLocal( GA%Decomp, Tag, Local, Pe )
!
! This is an entry which the local PE (iam) will receive from Pe
!
            IF ( Pe /= OldPe  .OR. Local /= OldLocal+1 ) THEN
              IF ( ja > 0 ) THEN
                BlockSizesA(ja) = Len
                Len = 0
              ENDIF
              ja = ja+1                     ! increment the segment index
              DisplacementsA(ja) = Inc      ! Zero-based offset of local segment
              GlobalA(ja) = Tag             ! The global tag of the desired datum
              PeA(ja) = Pe                  ! Note the ID of the sender
              Count(Pe+1) = Count(Pe+1)+1   ! Increment counter of segments
            ENDIF
            OldPe   = Pe                    ! Update old PE
            OldLocal = Local                ! Update old local index
            Len = Len+1                     ! Good -- segment is getting longer
          ENDIF
          Inc = Inc+1                       ! Increment local index
        ENDDO
      ENDDO
!
! Clean up
!
      BlockSizesA(ja) = Len

      CPP_ASSERT_F90( JA .LE. GlobalSizeA )

!
! Now create the pattern from the displacements and block sizes
!
      Inc = 0
      DO ipe = 1, NpesA
!
! Find the segments which are relevant for the sender ipe
! Make compact arrays BlockSizes and Displacements 
!
        DO j = 1, ja
          IF ( PeA(j) == ipe-1 ) THEN
            Inc = Inc + 1
            BlockSizesB(Inc) = BlockSizesA(j)
            DisplacementsB(Inc) = DisplacementsA(j)
            GlobalB(Inc)      = GlobalA(j)
          ENDIF
        ENDDO
      ENDDO
      CPP_ASSERT_F90( Inc .LE. GlobalSizeB )

!
! Create the receiver communication pattern
!
      Off = 0
      DO ipe = 1, NpesA
        Num = Count(ipe)
#if defined(DEBUG_PARPATTERNGHOSTTOGHOST)
        write(iulog,*) "Receiver Iam", Iam, "Ipe", Ipe-1, "Num", Num, &
                 "Displacements", DisplacementsB(Off+1:Off+Num), &
                 "BlockSizes", BlockSizesB(Off+1:Off+Num)
#endif

          IF ( Num > 0 .and. method > 0 ) THEN
            CALL MPI_TYPE_INDEXED( Num, BlockSizesB(Off+1),         &
                 DisplacementsB(Off+1), DataType, Ptr, Ierror )
            CALL MPI_TYPE_COMMIT( Ptr, Ierror )
            Pattern%RecvDesc(ipe)%type = Ptr
          ELSE
            Pattern%RecvDesc(ipe)%type = MPI_DATATYPE_NULL
          ENDIF

          ALLOCATE( Pattern%RecvDesc(ipe)%Displacements(Num) )
          ALLOCATE( Pattern%RecvDesc(ipe)%BlockSizes(Num) )
          DO i=1, Num
            Pattern%RecvDesc(ipe)%Displacements(i) = DisplacementsB(i+Off)
            Pattern%RecvDesc(ipe)%BlockSizes(i)    = BlockSizesB(i+Off)
          ENDDO
          Pattern%RecvDesc(ipe)%Nparcels  = &
            size (Pattern%RecvDesc(ipe)%Displacements)
          Pattern%RecvDesc(ipe)%Tot_Size = &
            sum  (Pattern%RecvDesc(ipe)%Blocksizes)
          Max_Nparcels = max (Max_Nparcels, Pattern%RecvDesc(ipe)%Nparcels)

        Off = Off + Num
      ENDDO

    endif !  (iam .lt. NpesB)

!
! Now communicate what the receiver is expecting to the sender
!
      CALL ParExchangeVectorInt( InComm, Count, GlobalB,                &
                                 CountOut, GlobalA  )
      CALL ParExchangeVectorInt( InComm, Count, BlockSizesB,            &
                                 CountOut, BlockSizesA )

    if (iam .lt. NpesA) then

!
! Sender A: BlockSizes and Displacements can now be stored
!
      Off = 0
      DO ipe=1, NpesB
        Num = CountOut(ipe)
        DO i=1, Num
          CALL DecompGlobalToLocal( GA%Local, GlobalA(i+Off), Local, Pe )
          DisplacementsA(i+Off) = Local-1    ! zero-based displacement
        ENDDO
#if defined(DEBUG_PARPATTERNGHOSTTOGHOST)
        write(iulog,*) "Sender Iam", Iam, "Ipe", Ipe-1, "Num", Num,  &
                 "Displacements", DisplacementsA(Off+1:Off+Num), &
                 "BlockSizes", BlockSizesA(Off+1:Off+Num)
#endif

          IF ( Num > 0 .and. method > 0 ) THEN
            CALL MPI_TYPE_INDEXED( Num, BlockSizesA(Off+1),        &
                 DisplacementsA(Off+1), DataType, Ptr, Ierror )
            CALL MPI_TYPE_COMMIT( Ptr, Ierror )
            Pattern%SendDesc(ipe)%type = Ptr
          ELSE
            Pattern%SendDesc(ipe)%type = MPI_DATATYPE_NULL
          ENDIF

          ALLOCATE( Pattern%SendDesc(ipe)%Displacements(Num) )
          ALLOCATE( Pattern%SendDesc(ipe)%BlockSizes(Num) )
          DO i=1, Num
            Pattern%SendDesc(ipe)%Displacements(i) = DisplacementsA(i+Off)
            Pattern%SendDesc(ipe)%BlockSizes(i)    = BlockSizesA(i+Off)
          ENDDO
          Pattern%SendDesc(ipe)%Nparcels  = &
            size (Pattern%SendDesc(ipe)%Displacements)
          Pattern%SendDesc(ipe)%Tot_Size = &
            sum  (Pattern%SendDesc(ipe)%Blocksizes)
          Max_Nparcels = max (Max_Nparcels, Pattern%SendDesc(ipe)%Nparcels)

        Off = Off + Num
      ENDDO

    endif !  (iam .lt. NpesA)

      CALL get_partneroffset( InComm, Pattern%SendDesc, Pattern%RecvDesc )


      DEALLOCATE( CountOut )
      DEALLOCATE( Count )

      DEALLOCATE( PeA )
      DEALLOCATE( GlobalA )
      DEALLOCATE( BlockSizesA )
      DEALLOCATE( DisplacementsA )

      DEALLOCATE( GlobalB )
      DEALLOCATE( BlockSizesB )
      DEALLOCATE( DisplacementsB )

      CPP_LEAVE_PROCEDURE( "PARPATTERNGHOSTTOGHOST" )
      RETURN
!EOC
      END SUBROUTINE ParPatternGhostToGhost
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !IROUTINE:   ParPatternFree --- Free the communication pattern
!
! !INTERFACE:

      SUBROUTINE ParPatternFree( InComm, Pattern )
!
! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER,  INTENT( IN )                 :: InComm  ! Communicator
! !INPUT/OUTPUT PARAMETERS:
      TYPE(ParPatternType), INTENT( INOUT )  :: Pattern ! Comm Pattern
!
! !DESCRIPTION:
!     This routine frees a communication pattern.  
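!
!     Illustrative lifecycle (names hypothetical): every pattern built
!     with ParPatternCreate should eventually be released here, which
!     frees any committed MPI derived types and the descriptor arrays:
!
!        CALL ParPatternCreate( CommGlobal, DecompA, DecompB, Pattern )
!        ! ... transfers using Pattern ...
!        CALL ParPatternFree  ( CommGlobal, Pattern )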
!
! !SYSTEM ROUTINES:
!     MPI_TYPE_FREE
!
! !BUGS:
!     The MPI_TYPE_FREE statement does not seem to work with FFC
!
! !REVISION HISTORY:
!   01.02.10   Sawyer     Creation
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER  ipe, GroupSize, Pointer, Ierror, method

      CPP_ENTER_PROCEDURE( "PARPATTERNFREE" )

      method = Pattern%RecvDesc(1)%method

!
! First request the needed ghost values from other processors.
!
! Free all the MPI derived types
!
      DO ipe=1, Pattern%Size
        Pointer = Pattern%SendDesc(ipe)%type
        IF ( Pointer /= MPI_DATATYPE_NULL ) THEN
          CALL MPI_TYPE_FREE( Pointer, Ierror )
        ENDIF
        Pointer = Pattern%RecvDesc(ipe)%type
        IF ( Pointer /= MPI_DATATYPE_NULL ) THEN
          CALL MPI_TYPE_FREE( Pointer, Ierror )
        ENDIF
      ENDDO

      DO ipe=1, size(Pattern%RecvDesc)
        DEALLOCATE( Pattern%RecvDesc(ipe)%Displacements )
        DEALLOCATE( Pattern%RecvDesc(ipe)%BlockSizes )
      ENDDO
      DO ipe=1, size(Pattern%SendDesc)
        DEALLOCATE( Pattern%SendDesc(ipe)%Displacements )
        DEALLOCATE( Pattern%SendDesc(ipe)%BlockSizes )
      ENDDO

      DEALLOCATE( Pattern%SendDesc )
      DEALLOCATE( Pattern%RecvDesc )

      CPP_LEAVE_PROCEDURE( "PARPATTERNFREE" )
      RETURN
!EOC
      END SUBROUTINE ParPatternFree
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParScatterReal --- Scatter slice to all PEs
!
! !INTERFACE:

      SUBROUTINE ParScatterReal ( InComm, Root, Slice, Decomp, Local )

! !USES:
      USE decompmodule, ONLY:  DecompType, Lists
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )          :: InComm       ! Communicator
      INTEGER, INTENT( IN )          :: Root         ! Root PE
      REAL(CPP_REAL8), INTENT( IN )  :: Slice(*)     ! Global Slice
      TYPE(DecompType), INTENT( IN ) :: Decomp       ! Decomp information

! !OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( OUT ) :: Local(*)     ! Local Slice

! !DESCRIPTION:
!     Given a decomposition of the domain, dole out a slice 
!     (one-dimensional array) to all the constituent PEs as described
!     by the decomposition Decomp.
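!
!     A minimal sketch (array names and sizes hypothetical): the root
!     holds the full slice, and every PE receives its own portion as
!     laid out in Decomp:
!
!        REAL(CPP_REAL8) :: Slice(GlobalSize), Local(LocalSize)
!        CALL ParScatter( InComm, 0, Slice, Decomp, Local )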
!
!
! !SYSTEM ROUTINES:
!     MPI_ISEND, MPI_RECV, MPI_COMM_RANK
!
! !REVISION HISTORY:
!   97.04.14   Sawyer     Creation
!   97.04.16   Sawyer     Cleaned up for walk-through
!   97.05.01   Sawyer     Use Decomp%Comm for all local info
!   97.05.18   Sawyer     DecompType has moved to ParUtilitiesTypes
!   97.05.29   Sawyer     Changed 2-D arrays to 1-D
!   97.07.03   Sawyer     Reformulated documentation
!   97.07.22   Sawyer     DecompType has moved to DecompModule
!   97.12.01   Sawyer     Changed MPI_SSEND to MPI_ISEND
!   97.12.05   Sawyer     Added InComm and Root as arguments
!   97.12.05   Sawyer     Added logic to support intercommunicators
!   98.01.24   Sawyer     Removed dependence on MPI derived types TESTED
!   98.02.05   Sawyer     Removed the use of intercommunicators
!   98.03.30   Sawyer     Stats dimension corrected: Gsize*MPI_STATUS_SIZE
!   99.01.19   Sawyer     Dropped assumed-size arrays
!   00.07.07   Sawyer     Removed "1D" references
!   00.07.23   Sawyer     Implementation with shared memory arenas
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:

      INTEGER Ierror, I, J, K, L, Iam, GroupSize
      INTEGER Status( MPI_STATUS_SIZE )
      Integer, allocatable :: Reqs(:), Stats(:)
      REAL(CPP_REAL8), ALLOCATABLE    :: SendBuf(:)
!
      CPP_ENTER_PROCEDURE( "PARSCATTERREAL" )
!
      CALL MPI_COMM_RANK( InComm, Iam, Ierror )
      CALL MPI_COMM_SIZE( InComm, GroupSize, Ierror )

      allocate (Reqs(GroupSize))
      allocate (Stats(GroupSize*MPI_STATUS_SIZE))

      IF ( Iam .EQ. Root ) THEN
        ALLOCATE( SendBuf( SUM( Decomp%NumEntries ) ) )
        L = 0
        DO I = 1, GroupSize
!
! Pick out the array sections to be sent.
! This is the inverse of the operation in ParGather
!
          DO J = 1, SIZE( Decomp%HEAD(I)%StartTags )
            DO K = Decomp%HEAD(I)%StartTags(J),Decomp%HEAD(I)%EndTags(J)
              L = L+1
              SendBuf(L) = Slice(K)
            ENDDO
          ENDDO
!
! This is a non-blocking send. SendBuf cannot be immediately deallocated
!
! WARNING: F90-MPI inconsistency: make sure the indexing below always works
!
          CALL MPI_ISEND( SendBuf(L-Decomp%NumEntries(I)+1),             &
                          Decomp%NumEntries(I), CPP_MPI_REAL8,           &
                          I-1, 0, InComm, Reqs(I), Ierror )

        ENDDO
      ENDIF

!
! All receive from the root.  
!
! The local array may be larger than that specified in the decomposition
!
      CALL MPI_RECV( Local, Decomp%NumEntries(Iam+1),                    &
                     CPP_MPI_REAL8,                                      &
                     Root, 0, InComm, Status, Ierror )
!
! Experience shows that we should wait for all the non-blocking
! sends to complete, EVEN THOUGH THE MPI_RECV HAS COMPLETED !!
!
      IF ( Iam .EQ. Root ) THEN
        CALL MPI_WAITALL( GroupSize, Reqs, Stats, Ierror )
        DEALLOCATE( SendBuf )
      ENDIF

!
! The following may be needed on some platforms to avoid an MPI bug.
!
      CALL MPI_BARRIER( InComm, Ierror )

      deallocate (Reqs)
      deallocate (Stats)

      CPP_LEAVE_PROCEDURE( "PARSCATTERREAL" )
      RETURN
!EOC
      END SUBROUTINE ParScatterReal
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParScatterReal4 --- Scatter slice to all PEs
!
! !INTERFACE:

      SUBROUTINE ParScatterReal4 ( InComm, Root, Slice, Decomp, Local )

! !USES:
      USE decompmodule, ONLY:  DecompType, Lists
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )          :: InComm       ! Communicator
      INTEGER, INTENT( IN )          :: Root         ! Root PE
      REAL(CPP_REAL4), INTENT( IN )  :: Slice(*)     ! Global Slice
      TYPE(DecompType), INTENT( IN ) :: Decomp       ! Decomp information

! !OUTPUT PARAMETERS:
      REAL(CPP_REAL4), INTENT( OUT ) :: Local(*)     ! Local Slice

! !DESCRIPTION:
!     Given a decomposition of the domain, dole out a slice 
!     (one-dimensional array) to all the constituent PEs as described
!     by the decomposition Decomp.
!
!
! !SYSTEM ROUTINES:
!     MPI_ISEND, MPI_RECV, MPI_COMM_RANK
!
! !REVISION HISTORY:
!   97.04.14   Sawyer     Creation
!   97.04.16   Sawyer     Cleaned up for walk-through
!   97.05.01   Sawyer     Use Decomp%Comm for all local info
!   97.05.18   Sawyer     DecompType has moved to ParUtilitiesTypes
!   97.05.29   Sawyer     Changed 2-D arrays to 1-D
!   97.07.03   Sawyer     Reformulated documentation
!   97.07.22   Sawyer     DecompType has moved to DecompModule
!   97.12.01   Sawyer     Changed MPI_SSEND to MPI_ISEND
!   97.12.05   Sawyer     Added InComm and Root as arguments
!   97.12.05   Sawyer     Added logic to support intercommunicators
!   98.01.24   Sawyer     Removed dependence on MPI derived types TESTED
!   98.02.05   Sawyer     Removed the use of intercommunicators
!   98.03.30   Sawyer     Stats dimension corrected: Gsize*MPI_STATUS_SIZE
!   99.01.19   Sawyer     Dropped assumed-size arrays
!   00.07.07   Sawyer     Removed "1D" references
!   00.07.23   Sawyer     Implementation with shared memory arenas
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:

      INTEGER Ierror, I, J, K, L, Iam, GroupSize
      INTEGER Status( MPI_STATUS_SIZE )
      Integer, allocatable :: Reqs(:), Stats(:)
      REAL(CPP_REAL4), ALLOCATABLE    :: SendBuf(:)
!
      CPP_ENTER_PROCEDURE( "PARSCATTERREAL4" )
!
      CALL MPI_COMM_RANK( InComm, Iam, Ierror )
      CALL MPI_COMM_SIZE( InComm, GroupSize, Ierror )

      allocate (Reqs(GroupSize))
      allocate (Stats(GroupSize*MPI_STATUS_SIZE))

      IF ( Iam .EQ. Root ) THEN
        ALLOCATE( SendBuf( SUM( Decomp%NumEntries ) ) )
        L = 0
        DO I = 1, GroupSize
!
! Pick out the array sections to be sent.
! This is the inverse of the operation in ParGather
!
          DO J = 1, SIZE( Decomp%HEAD(I)%StartTags )
            DO K = Decomp%HEAD(I)%StartTags(J),Decomp%HEAD(I)%EndTags(J)
              L = L+1
              SendBuf(L) = Slice(K)
            ENDDO
          ENDDO
!
! This is a non-blocking send. SendBuf cannot be immediately deallocated
!
! WARNING: F90-MPI inconsistency: make sure the indexing below always works
!
          CALL MPI_ISEND( SendBuf(L-Decomp%NumEntries(I)+1),             &
                          Decomp%NumEntries(I), CPP_MPI_REAL4,           &
                          I-1, 0, InComm, Reqs(I), Ierror )

        ENDDO
      ENDIF

!
! All receive from the root.  
!
! The local array may be larger than that specified in the decomposition
!
      CALL MPI_RECV( Local, Decomp%NumEntries(Iam+1),                    &
                     CPP_MPI_REAL4,                                      &
                     Root, 0, InComm, Status, Ierror )
!
! Experience shows that we should wait for all the non-blocking
! sends to complete, EVEN THOUGH THE MPI_RECV HAS COMPLETED !!
!
      IF ( Iam .EQ. Root ) THEN
        CALL MPI_WAITALL( GroupSize, Reqs, Stats, Ierror )
        DEALLOCATE( SendBuf )
      ENDIF

!
! The following may be needed on some platforms to avoid an MPI bug.
!
      CALL MPI_BARRIER( InComm, Ierror )

      deallocate (Reqs)
      deallocate (Stats)

      CPP_LEAVE_PROCEDURE( "PARSCATTERREAL4" )
      RETURN
!EOC
      END SUBROUTINE ParScatterReal4
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParScatterInt --- Scatter slice to all PEs
!
! !INTERFACE:

      SUBROUTINE ParScatterInt ( InComm, Root, Slice, Decomp, Local )

! !USES:
      USE decompmodule, ONLY:  DecompType, Lists
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )          :: InComm       ! Communicator
      INTEGER, INTENT( IN )          :: Root         ! Root PE
      INTEGER, INTENT( IN )          :: Slice(*)     ! Global Slice
      TYPE(DecompType), INTENT( IN ) :: Decomp       ! Decomp information

! !OUTPUT PARAMETERS:
      INTEGER, INTENT( OUT )         :: Local(*)     ! Local Slice

! !DESCRIPTION:
!     Given a decomposition of the domain, dole out a slice 
!     (one-dimensional array) to all the constituent PEs as described
!     by the decomposition Decomp.
!
!
! !SYSTEM ROUTINES:
!     MPI_ISEND, MPI_RECV, MPI_COMM_RANK
!
! !REVISION HISTORY:
!   97.04.14   Sawyer     Creation
!   97.04.16   Sawyer     Cleaned up for walk-through
!   97.05.01   Sawyer     Use Decomp%Comm for all local info
!   97.05.18   Sawyer     DecompType has moved to ParUtilitiesTypes
!   97.05.29   Sawyer     Changed 2-D arrays to 1-D
!   97.07.03   Sawyer     Reformulated documentation
!   97.07.22   Sawyer     DecompType has moved to DecompModule
!   97.12.01   Sawyer     Changed MPI_SSEND to MPI_ISEND
!   97.12.05   Sawyer     Added InComm and Root as arguments
!   97.12.05   Sawyer     Added logic to support intercommunicators
!   98.01.24   Sawyer     Removed dependence on MPI derived types TESTED
!   98.02.05   Sawyer     Removed the use of intercommunicators
!   98.03.30   Sawyer     Stats dimension corrected: Gsize*MPI_STATUS_SIZE
!   99.01.19   Sawyer     Dropped assumed-size arrays
!   00.07.07   Sawyer     Removed "1D" references
!   00.07.23   Sawyer     Implementation with shared memory arenas
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:

      INTEGER Ierror, I, J, K, L, Iam, GroupSize
      INTEGER Status( MPI_STATUS_SIZE )
      Integer, allocatable :: Reqs(:), Stats(:)
      INTEGER, ALLOCATABLE    :: SendBuf(:)
!
      CPP_ENTER_PROCEDURE( "PARSCATTERINT" )
!
      CALL MPI_COMM_RANK( InComm, Iam, Ierror )
      CALL MPI_COMM_SIZE( InComm, GroupSize, Ierror )

      allocate (Reqs(GroupSize))
      allocate (Stats(GroupSize*MPI_STATUS_SIZE))

      IF ( Iam .EQ. Root ) THEN
        ALLOCATE( SendBuf( SUM( Decomp%NumEntries ) ) )
        L = 0
        DO I = 1, GroupSize
!
! Pick out the array sections to be sent.
! This is the inverse of the operation in ParGather
!
          DO J = 1, SIZE( Decomp%HEAD(I)%StartTags )
            DO K = Decomp%HEAD(I)%StartTags(J),Decomp%HEAD(I)%EndTags(J)
              L = L+1
              SendBuf(L) = Slice(K)
            ENDDO
          ENDDO
!
! This is a non-blocking send. SendBuf cannot be immediately deallocated
!
! WARNING: F90-MPI inconsistency: make sure the indexing below always works
!
          CALL MPI_ISEND( SendBuf(L-Decomp%NumEntries(I)+1),              &
                          Decomp%NumEntries(I), CPP_MPI_INTEGER,          &
                          I-1, 0, InComm, Reqs(I), Ierror )

        ENDDO
      ENDIF

!
! All receive from the root.  
!
! The local array may be larger than that specified in the decomposition
!
      CALL MPI_RECV( Local, Decomp%NumEntries(Iam+1),                     &
                     CPP_MPI_INTEGER,                                     &
                     Root, 0, InComm, Status, Ierror )
!
! Experience shows that we should wait for all the non-blocking
! sends to complete, EVEN THOUGH THE MPI_RECV HAS COMPLETED !!
!
      IF ( Iam .EQ. Root ) THEN
        CALL MPI_WAITALL( GroupSize, Reqs, Stats, Ierror )
        DEALLOCATE( SendBuf )
      ENDIF

!
! The following may be needed on some platforms to avoid an MPI bug.
!
      CALL MPI_BARRIER( InComm, Ierror )

      deallocate (Reqs)
      deallocate (Stats)

      CPP_LEAVE_PROCEDURE( "PARSCATTERINT" )
      RETURN
!EOC
      END SUBROUTINE ParScatterInt
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParGatherReal --- Gather Slice from all PEs
!
! !INTERFACE:  

      SUBROUTINE ParGatherReal ( InComm, Root, Local, Decomp, Slice )

! !USES:
      USE decompmodule, ONLY:  DecompType, Lists
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )          :: InComm       ! Communicator
      INTEGER, INTENT( IN )          :: Root         ! Root PE
      REAL(CPP_REAL8), INTENT( IN )  :: Local(*)     ! Local Slice
      TYPE(DecompType), INTENT( IN ) :: Decomp       ! Decomp information

! !OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( OUT ) :: Slice(*)     ! Global Slice

! !DESCRIPTION:
!     Given a decomposition of the domain and a local portion of the
!     total slice on each PE, gather together the portions into a
!     global slice on the root PE
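!
!     A minimal sketch (names hypothetical); ParGather is the inverse
!     of ParScatter, so a scatter followed by a gather reproduces the
!     slice on the root:
!
!        CALL ParScatter( InComm, 0, Slice, Decomp, Local )
!        CALL ParGather ( InComm, 0, Local, Decomp, Slice )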
!
! !SYSTEM ROUTINES:
!     MPI_ISEND, MPI_RECV, MPI_COMM_RANK
!
! !REVISION HISTORY:
!   97.04.14   Sawyer     Creation
!   97.04.16   Sawyer     Cleaned up for walk-through
!   97.05.01   Sawyer     Use Decomp%Comm for all local info
!   97.05.18   Sawyer     DecompType has moved to ParUtilitiesTypes
!   97.05.29   Sawyer     Changed 2-D arrays to 1-D
!   97.07.03   Sawyer     Reformulated documentation
!   97.07.22   Sawyer     DecompType has moved to DecompModule
!   97.12.01   Sawyer     Changed MPI_SSEND to MPI_ISEND
!   97.12.05   Sawyer     Added InComm and Root as arguments
!   97.12.05   Sawyer     Added logic to support intercommunicators
!   98.01.24   Sawyer     Removed dependence on MPI derived types TESTED
!   98.01.29   Sawyer     Corrected assertions
!   98.02.05   Sawyer     Removed the use of intercommunicators
!   98.03.31   Sawyer     Stat dimension corrected: MPI_STATUS_SIZE
!   98.04.22   Sawyer     Local no longer assumed shape: Local(*)
!   99.01.19   Sawyer     Dropped assumed-size arrays
!   00.07.07   Sawyer     Removed "1D" references
!   00.07.23   Sawyer     Implementation with shared memory arenas
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER Ierror, I, J, K, L, Iam, GroupSize, Req
      INTEGER Status( MPI_STATUS_SIZE ), Stat( MPI_STATUS_SIZE )
      REAL(CPP_REAL8), ALLOCATABLE    :: RecvBuf(:)
!
      CPP_ENTER_PROCEDURE( "PARGATHERREAL" )
!
      CALL MPI_COMM_RANK( InComm, Iam, Ierror )
      CALL MPI_COMM_SIZE( InComm, GroupSize, Ierror )
!
! All PEs send their contribution to the root
!
      CALL MPI_ISEND( Local, Decomp%NumEntries(Iam+1),                   &
                      CPP_MPI_REAL8,                                     &
                      Root, Iam+3001, InComm, Req, Ierror )

      IF ( Iam .EQ. Root ) THEN
        ALLOCATE( RecvBuf( SUM( Decomp%NumEntries ) ) )
!
! On the Root PE receive from every other PE
!
        L = 0
        DO I = 1, GroupSize
!
! This is a blocking, synchronous recv.  All the
! sends should have been posted so it should not deadlock
!
! WARNING: F90-MPI inconsistency: make sure the indexing below always works
!
          CPP_ASSERT_F90( L .LT. SIZE( RecvBuf ) )
          CALL MPI_RECV( RecvBuf(L+1), Decomp%NumEntries(I),             &
                         CPP_MPI_REAL8, I-1, I+3000, InComm,             &
                         Status, Ierror )
!
! This is the simple reverse mapping of that in ParScatter
!
          DO J = 1, SIZE( Decomp%HEAD(I)%StartTags )
            DO K = Decomp%HEAD(I)%StartTags(J),Decomp%HEAD(I)%EndTags(J)
              L = L + 1
              Slice(K) = RecvBuf(L)
#if defined(DEBUG_PARGATHERREAL)
                PRINT *, " Entry ", L, RecvBuf(L), K, SIZE(Slice)
#endif
            ENDDO
          ENDDO
        ENDDO
        DEALLOCATE( RecvBuf )
      ENDIF
      CALL MPI_WAIT( Req, Stat, Ierror )
!
! The following may be needed on some platforms to avoid an MPI bug.
!
      CALL MPI_BARRIER( InComm, Ierror )

      CPP_LEAVE_PROCEDURE( "PARGATHERREAL" )
      RETURN
!EOC
      END SUBROUTINE ParGatherReal
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParGatherReal4 --- Gather Slice from all PEs
!
! !INTERFACE:  

      SUBROUTINE ParGatherReal4 ( InComm, Root, Local, Decomp, Slice )

! !USES:
      USE decompmodule, ONLY:  DecompType, Lists
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )          :: InComm       ! Communicator
      INTEGER, INTENT( IN )          :: Root         ! Root PE
      REAL(CPP_REAL4), INTENT( IN )  :: Local(*)     ! Local Slice
      TYPE(DecompType), INTENT( IN ) :: Decomp       ! Decomp information

! !OUTPUT PARAMETERS:
      REAL(CPP_REAL4), INTENT( OUT ) :: Slice(*)     ! Global Slice

! !DESCRIPTION:
!     Given a decomposition of the domain and a local portion of the
!     total slice on each PE, gather together the portions into a
!     global slice on the root PE
!
! !SYSTEM ROUTINES:
!     MPI_ISEND, MPI_RECV, MPI_COMM_RANK
!
! !REVISION HISTORY:
!   97.04.14   Sawyer     Creation
!   97.04.16   Sawyer     Cleaned up for walk-through
!   97.05.01   Sawyer     Use Decomp%Comm for all local info
!   97.05.18   Sawyer     DecompType has moved to ParUtilitiesTypes
!   97.05.29   Sawyer     Changed 2-D arrays to 1-D
!   97.07.03   Sawyer     Reformulated documentation
!   97.07.22   Sawyer     DecompType has moved to DecompModule
!   97.12.01   Sawyer     Changed MPI_SSEND to MPI_ISEND
!   97.12.05   Sawyer     Added InComm and Root as arguments
!   97.12.05   Sawyer     Added logic to support intercommunicators
!   98.01.24   Sawyer     Removed dependence on MPI derived types TESTED
!   98.01.29   Sawyer     Corrected assertions
!   98.02.05   Sawyer     Removed the use of intercommunicators
!   98.03.31   Sawyer     Stat dimension corrected: MPI_STATUS_SIZE
!   98.04.22   Sawyer     Local no longer assumed shape: Local(*)
!   99.01.19   Sawyer     Dropped assumed-size arrays
!   00.07.07   Sawyer     Removed "1D" references
!   00.07.23   Sawyer     Implementation with shared memory arenas
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER Ierror, I, J, K, L, Iam, GroupSize, Req
      INTEGER Status( MPI_STATUS_SIZE ), Stat( MPI_STATUS_SIZE )
      REAL(CPP_REAL4), ALLOCATABLE    :: RecvBuf(:)
!
      CPP_ENTER_PROCEDURE( "PARGATHERREAL4" )
!
      CALL MPI_COMM_RANK( InComm, Iam, Ierror )
      CALL MPI_COMM_SIZE( InComm, GroupSize, Ierror )
!
! All PEs send their contribution to the root
!
      CALL MPI_ISEND( Local, Decomp%NumEntries(Iam+1),                   &
                      CPP_MPI_REAL4,                                     &
                      Root, Iam+3001, InComm, Req, Ierror )

      IF ( Iam .EQ. Root ) THEN
        ALLOCATE( RecvBuf( SUM( Decomp%NumEntries ) ) )
!
! On the Root PE receive from every other PE
!
        L = 0
        DO I = 1, GroupSize
!
! This is a blocking, synchronous recv.  All the
! sends should have been posted so it should not deadlock
!
! WARNING: F90-MPI inconsistency: make sure the indexing below always works
!
          CPP_ASSERT_F90( L .LT. SIZE( RecvBuf ) )
          CALL MPI_RECV( RecvBuf(L+1), Decomp%NumEntries(I),             &
                         CPP_MPI_REAL4, I-1, I+3000, InComm,             &
                         Status, Ierror )
!
! This is the simple reverse mapping of that in ParScatter
!
          DO J = 1, SIZE( Decomp%HEAD(I)%StartTags )
            DO K = Decomp%HEAD(I)%StartTags(J),Decomp%HEAD(I)%EndTags(J)
              L = L + 1
              Slice(K) = RecvBuf(L)
#if defined(DEBUG_PARGATHERREAL4)
                PRINT *, " Entry ", L, RecvBuf(L), K, SIZE(Slice)
#endif
            ENDDO
          ENDDO
        ENDDO
        DEALLOCATE( RecvBuf )
      ENDIF
      CALL MPI_WAIT( Req, Stat, Ierror )
!
! The following may be needed on some platforms to avoid an MPI bug.
!
      CALL MPI_BARRIER( InComm, Ierror )
      CPP_LEAVE_PROCEDURE( "PARGATHERREAL4" )
      RETURN
!EOC
      END SUBROUTINE ParGatherReal4
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParGatherInt --- Gather Slice from all PEs
!
! !INTERFACE:  

      SUBROUTINE ParGatherInt ( InComm, Root, Local, Decomp, Slice )

! !USES:
      USE decompmodule, ONLY:  DecompType, Lists
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )          :: InComm       ! Communicator
      INTEGER, INTENT( IN )          :: Root         ! Root PE
      INTEGER, INTENT( IN )          :: Local(*)     ! Local Slice
      TYPE(DecompType), INTENT( IN ) :: Decomp       ! Decomp information

! !OUTPUT PARAMETERS:
      INTEGER, INTENT( OUT )         :: Slice(*)     ! Global Slice

! !DESCRIPTION:
!     Given a decomposition of the domain and a local portion of the
!     total slice on each PE, gather together the portions into a
!     global slice on the root PE
!
! !SYSTEM ROUTINES:
!     MPI_ISEND, MPI_RECV, MPI_COMM_RANK
!
! !REVISION HISTORY:
!   97.04.14   Sawyer     Creation
!   97.04.16   Sawyer     Cleaned up for walk-through
!   97.05.01   Sawyer     Use Decomp%Comm for all local info
!   97.05.18   Sawyer     DecompType has moved to ParUtilitiesTypes
!   97.05.29   Sawyer     Changed 2-D arrays to 1-D
!   97.07.03   Sawyer     Reformulated documentation
!   97.07.22   Sawyer     DecompType has moved to DecompModule
!   97.12.01   Sawyer     Changed MPI_SSEND to MPI_ISEND
!   97.12.05   Sawyer     Added InComm and Root as arguments
!   97.12.05   Sawyer     Added logic to support intercommunicators
!   98.01.24   Sawyer     Removed dependence on MPI derived types TESTED
!   98.01.29   Sawyer     Corrected assertions
!   98.02.05   Sawyer     Removed the use of intercommunicators
!   98.03.31   Sawyer     Stat dimension corrected: MPI_STATUS_SIZE
!   98.04.22   Sawyer     Local no longer assumed shape: Local(*)
!   99.01.19   Sawyer     Dropped assumed-size arrays
!   00.07.07   Sawyer     Removed "1D" references
!   00.07.23   Sawyer     Implementation with shared memory arenas
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER Ierror, I, J, K, L, Iam, GroupSize, Req
      INTEGER Status( MPI_STATUS_SIZE ), Stat( MPI_STATUS_SIZE )
      INTEGER, ALLOCATABLE    :: RecvBuf(:)
!
      CPP_ENTER_PROCEDURE( "PARGATHERINT" )
!
      CALL MPI_COMM_RANK( InComm, Iam, Ierror )
      CALL MPI_COMM_SIZE( InComm, GroupSize, Ierror )
!
! All PEs send their contribution to the root
!
      CALL MPI_ISEND( Local, Decomp%NumEntries(Iam+1), CPP_MPI_INTEGER,       &
                      Root, Iam+3001, InComm, Req, Ierror )

      IF ( Iam .EQ. Root ) THEN
        ALLOCATE( RecvBuf( SUM( Decomp%NumEntries ) ) )
!
! On the Root PE receive from every other PE
!
        L = 0
        DO I = 1, GroupSize
!
! This is a blocking, synchronous recv.  All the
! sends should have been posted so it should not deadlock
!
! WARNING: F90-MPI inconsistency: make sure the indexing below always works
!
          CPP_ASSERT_F90( L .LT. SIZE( RecvBuf ) )
          CALL MPI_RECV( RecvBuf(L+1), Decomp%NumEntries(I),                  &
                         CPP_MPI_INTEGER, I-1, I+3000, InComm,                &
                         Status, Ierror )
!
! This is the simple reverse mapping of that in ParScatter
!
          DO J = 1, SIZE( Decomp%HEAD(I)%StartTags )
            DO K = Decomp%HEAD(I)%StartTags(J),Decomp%HEAD(I)%EndTags(J)
              L = L + 1
              Slice(K) = RecvBuf(L)
#if defined(DEBUG_PARGATHERINT)
                PRINT *, " Entry ", L, RecvBuf(L), K, SIZE(Slice)
#endif
            ENDDO
          ENDDO
        ENDDO
        DEALLOCATE( RecvBuf )
      ENDIF
      CALL MPI_WAIT( Req, Stat, Ierror )
!
! The following may be needed on some platforms to avoid an MPI bug.
!
      CALL MPI_BARRIER( InComm, Ierror )

      CPP_LEAVE_PROCEDURE( "PARGATHERINT" )
      RETURN
!EOC
      END SUBROUTINE ParGatherInt
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParBeginTransferReal --- Start an ASYNC Real Transfer
!
! !INTERFACE:

      SUBROUTINE ParBeginTransferReal(InComm, NrInPackets, NrOutPackets, &
                                      Dest, Src, InBuf, InIA,            &
                                      OutBuf, OutIA )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )       :: InComm       ! Communicator
      INTEGER, INTENT( IN )       :: NrInPackets  ! Number of in packets
      INTEGER, INTENT( IN )       :: NrOutPackets ! Number of out packets
      INTEGER, INTENT( IN )       :: Dest(:)      ! PE destinations
      INTEGER, INTENT( IN )       :: Src(:)       ! PE sources
      REAL(CPP_REAL8), INTENT(IN) :: InBuf(:)     ! Input buffer
      INTEGER, INTENT( IN )       :: InIA(:)      ! In packet counter
      INTEGER, INTENT( IN )       :: OutIA(:)     ! Out packet counter

! !OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( OUT ) :: OutBuf(:)  ! Output buffer

! !DESCRIPTION: 
!
!     This routine initiates an async. transfer of an array InBuf
!     partitioned into parcels defined by the arrays InIA and Dest
!     to an output array OutBuf on another PE. InIA(1) contains 
!     the number of reals to be sent to Dest(1), InIA(2) the number 
!     of reals to be sent to Dest(2), etc.  Similarly, the array
!     OutBuf on the calling PE is partitioned into parcels by OutIA
!     and Src, with OutIA(1) the number of reals anticipated from
!     Src(1), etc.  
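!
!     For illustration (values hypothetical): with Dest = (/ 1, 3 /) and
!     InIA = (/ 4, 2 /), the parcel InBuf(1:4) is sent to PE 1 and
!     InBuf(5:6) to PE 3; OutIA and Src partition OutBuf in the same
!     manner for the receives.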
!
!     The default implementation reads through the contiguous array 
!     InBuf and sends the parcels to the designated PEs with an 
!     asynchronous MPI\_ISEND.  Correspondingly it posts the receives 
!     with an asynchronous MPI\_IRECV.
!
!     Wait handles InHandle(:) and OutHandle(:) are in common block.
!
! !BUGS:
!
!     It is assumed that the buffers are passed to this routine by
!     reference!!!!!!!!!!
!
!     The buffers may not be accessed until after the call to 
!     ParEndTransferReal.
!
!
! !SYSTEM ROUTINES:
!     MPI_COMM_RANK, MPI_ISEND, MPI_IRECV
!
! !REVISION HISTORY:
!   97.09.26   Sawyer     Creation
!   97.12.05   Sawyer     Renamed Comm to InComm to avoid collisions
!   98.02.26   Sawyer     Added Dest, Src and Remote to clean up code
!   98.04.16   Sawyer     Number of packets become input arguments
!   98.09.04   Sawyer     Cleaned interface: handles in common, no Remote
!   99.03.04   Sawyer     Inlined ParCalculateRemote
!   99.06.01   Sawyer     Changed pointer arrays to INTEGER*8 for SGI
!   00.08.07   Sawyer     Implementation with shared memory arenas
!   01.09.27   Sawyer     Added multiple shared buffers for USE_MLP
!
!EOP
!-----------------------------------------------------------------------
!BOC

! !LOCAL VARIABLES:
      INTEGER Iam, GroupSize, Nr, Icnt, Packet, I, Ierr

      CPP_ENTER_PROCEDURE( "PARBEGINTRANSFERREAL" )
      CPP_ASSERT_F90( NrInPackets .LE. SIZE( Dest ) )
      CPP_ASSERT_F90( NrInPackets .LE. SIZE( InIA ) )
      CPP_ASSERT_F90( NrOutPackets .LE. SIZE( Src ) )
      CPP_ASSERT_F90( NrOutPackets .LE. SIZE( OutIA ) )

!
! Increment the ongoing transfer number
      BegTrf = MOD(BegTrf,MAX_TRF) + 1

      CALL MPI_COMM_RANK( InComm, Iam, Ierr )
      CALL MPI_COMM_SIZE( InComm, GroupSize, Ierr )

!
!     MPI: Irecv over all processes
!
      Icnt = 1
      DO Packet = 1, NrOutPackets
        Nr = OutIA( Packet )
        IF ( Nr .GT. 0 ) THEN
#if defined( DEBUG_PARBEGINTRANSFERREAL )
          PRINT *, "Iam ",Iam," posts recv ",Nr," from ", Src( Packet )
#endif
!
! Receive the buffers with MPI_Irecv. Non-blocking
!
          CPP_ASSERT_F90( Icnt+Nr-1 .LE. SIZE( OutBuf ) )
          CALL MPI_IRECV( OutBuf( Icnt ), Nr,                            &
                CPP_MPI_REAL8, Src( Packet ), Src( Packet ),             &
                InComm, OutHandle(Packet,1,BegTrf), Ierr )
        ELSE
          OutHandle(Packet,1,BegTrf) = MPI_REQUEST_NULL
        END IF
        Icnt = Icnt + Nr
      END DO
!
!     MPI: Isend over all processes
!
      Icnt = 1
      CPP_ASSERT_F90( NrInPackets .LE. SIZE( Dest ) )
      CPP_ASSERT_F90( NrInPackets .LE. SIZE( InIA ) )
      DO Packet = 1, NrInPackets
        Nr = InIA( Packet )
        IF ( Nr .GT. 0 ) THEN
#if defined( DEBUG_PARBEGINTRANSFERREAL )
          PRINT *,"Iam ",Iam," posts send ",Nr," to ",Dest( Packet )
#endif
!
!     Send the individual buffers with non-blocking sends
!
          CPP_ASSERT_F90( Icnt+Nr-1 .LE. SIZE( InBuf ) )
          CALL MPI_ISEND ( InBuf( Icnt ), Nr,                            &
                CPP_MPI_REAL8, Dest( Packet ), Iam,                      &
                InComm, InHandle(Packet,1,BegTrf), Ierr )
        ELSE
          InHandle(Packet,1,BegTrf) = MPI_REQUEST_NULL
        END IF
        Icnt = Icnt + Nr
      END DO
!
!
      CPP_LEAVE_PROCEDURE( "PARBEGINTRANSFERREAL" )
      RETURN
!EOC
      END SUBROUTINE ParBeginTransferReal
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParBeginTransferPattern1D --- Start ASYNC Pattern Transfer
!
! !INTERFACE:

      SUBROUTINE ParBeginTransferPattern1D( InComm, Pattern, InBuf, OutBuf )

! !USES:
      USE mod_comm, ONLY : mp_sendirr
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )       :: InComm       ! Communicator
      TYPE (ParPatternType), INTENT( IN )  :: Pattern   ! Comm Pattern
      REAL(CPP_REAL8), INTENT( IN )        :: InBuf(*)  ! Input buffer

! !OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( OUT )       :: OutBuf(*) ! Output buffer

! !DESCRIPTION: 
!
!     This routine initiates an async. transfer of an array InBuf.
!     The communication pattern indicates the indices of the outgoing 
!     values of InBuf and the incoming values for OutBuf.  This routine
!     is fundamentally equivalent to ParBeginTransferReal; the use 
!     of a communication pattern is largely a performance enhancement, 
!     since it eliminates the need for intermediate buffering.
!     
!     Wait handles InHandle and OutHandle are module variables.
!     The buffers may not be accessed until after the call to 
!     ParEndTransferReal.  
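!
!     A minimal sketch (array names hypothetical; the ParEndTransfer
!     argument list is assumed to mirror the Begin call):
!
!        CALL ParBeginTransfer( InComm, Pattern, InBuf, OutBuf )
!        ! ... work that touches neither InBuf nor OutBuf ...
!        CALL ParEndTransfer  ( InComm, Pattern, InBuf, OutBuf )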
!
! !BUGS:
!
!     It is assumed that the buffers are passed to this routine by
!     reference.
!
! !REVISION HISTORY:
!   01.02.14   Sawyer     Creation from ParBeginTransferReal
!   01.09.27   Sawyer     Added multiple shared buffers for USE_MLP
!   02.08.13   Sawyer     Now uses mod_comm unless Use_Mpi_Types
!   03.06.24   Sawyer     All complexity now in mp_sendirr
!
!EOP
!-----------------------------------------------------------------------
!BOC

! !LOCAL VARIABLES:
      CPP_ENTER_PROCEDURE( "PARBEGINTRANSFERPATTERN1D" )

      CALL mp_sendirr( InComm,Pattern%SendDesc,Pattern%RecvDesc,InBuf,OutBuf )
!
      CPP_LEAVE_PROCEDURE( "PARBEGINTRANSFERPATTERN1D" )
      RETURN
!EOC
      END SUBROUTINE ParBeginTransferPattern1D
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParBeginTransferPattern1Dint --- Start ASYNC Pattern Transfer
!
! !INTERFACE:

      SUBROUTINE ParBeginTransferPattern1Dint( InComm, Pattern, InBuf, OutBuf )

! !USES:
      USE mod_comm, ONLY : mp_sendirr_i4
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )       :: InComm       ! Communicator
      TYPE (ParPatternType), INTENT( IN )  :: Pattern   ! Comm Pattern
      INTEGER, INTENT( IN )                :: InBuf(*)  ! Input buffer

! !OUTPUT PARAMETERS:
      INTEGER, INTENT( OUT )               :: OutBuf(*) ! Output buffer

! !DESCRIPTION: 
!
!     This routine initiates an async. transfer of an array InBuf.
!     The communication pattern indicates the indices of the outgoing 
!     values of InBuf and the incoming values for OutBuf.  This routine
!     is fundamentally equivalent to ParBeginTransferReal; the use 
!     of a communication pattern is largely a performance enhancement, 
!     since it eliminates the need for intermediate buffering.
!     
!     Wait handles InHandle and OutHandle are module variables.
!     The buffers may not be accessed until after the call to 
!     ParEndTransferReal.  
!
! !BUGS:
!
!     It is assumed that the buffers are passed to this routine by
!     reference.
!
! !REVISION HISTORY:
!   01.02.14   Sawyer     Creation from ParBeginTransferReal
!   01.09.27   Sawyer     Added multiple shared buffers for USE_MLP
!   02.08.13   Sawyer     Now uses mod_comm unless Use_Mpi_Types
!   03.06.24   Sawyer     All complexity now in mp_sendirr_i4
! 
!EOP
!-----------------------------------------------------------------------
!BOC

      CPP_ENTER_PROCEDURE( "PARBEGINTRANSFERPATTERN1DINT" )

      CALL mp_sendirr_i4( InComm,Pattern%SendDesc,Pattern%RecvDesc,InBuf,OutBuf )

      CPP_LEAVE_PROCEDURE( "PARBEGINTRANSFERPATTERN1DINT" )
      RETURN
!EOC
      END SUBROUTINE ParBeginTransferPattern1Dint
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParBeginTransferPattern2D --- Start an ASYNC Pattern Transfer
!
! !INTERFACE:

      SUBROUTINE ParBeginTransferPattern2D( InComm, Pattern, InBuf, OutBuf )

! !USES:
      USE mod_comm, ONLY : mp_sendirr
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )       :: InComm       ! Communicator
      TYPE (ParPatternType), INTENT(IN)  :: Pattern      ! Comm Pattern
      REAL(CPP_REAL8), INTENT(IN)        :: InBuf(:,:)   ! Input buffer

! !OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT(OUT)       :: OutBuf(:,:)  ! Output buffer

! !DESCRIPTION: 
!
!     This routine initiates an asynchronous transfer of the array InBuf.
!     The communication pattern indicates the indices of the outgoing
!     values in InBuf and of the incoming values in OutBuf.  This routine
!     is fundamentally equivalent to ParBeginTransferReal; the use
!     of a communication pattern is largely a performance enhancement,
!     since it eliminates the need for intermediate buffering.
!
!     The wait handles InHandle and OutHandle are module variables.
!     The buffers may not be accessed until after the corresponding
!     ParEndTransfer call.
!
! !REVISION HISTORY:
!   01.10.01   Sawyer     Creation from ParBeginTransferPattern
!   02.08.13   Sawyer     Now uses mod_comm unless Use_Mpi_Types
!   03.06.24   Sawyer     All complexity now in mp_sendirr
! 
!EOP
!-----------------------------------------------------------------------
!BOC

      CPP_ENTER_PROCEDURE( "PARBEGINTRANSFERPATTERN2D" )

      CALL mp_sendirr( InComm,Pattern%SendDesc,Pattern%RecvDesc,InBuf,OutBuf )

      CPP_LEAVE_PROCEDURE( "PARBEGINTRANSFERPATTERN2D" )
      RETURN
!EOC
      END SUBROUTINE ParBeginTransferPattern2D
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParBeginTransferPattern3D --- Start an ASYNC Pattern Transfer
!
! !INTERFACE:

      SUBROUTINE ParBeginTransferPattern3D( InComm, Pattern, InBuf, OutBuf )

! !USES:
      USE mod_comm, ONLY : mp_sendirr
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )       :: InComm       ! Communicator
      TYPE (ParPatternType), INTENT(IN)  :: Pattern      ! Comm Pattern
      REAL(CPP_REAL8), INTENT(IN)        :: InBuf(:,:,:) ! Input buffer

! !OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT(OUT)       :: OutBuf(:,:,:)! Output buffer

! !DESCRIPTION: 
!
!     This routine initiates an asynchronous transfer of the array InBuf.
!     The communication pattern indicates the indices of the outgoing
!     values in InBuf and of the incoming values in OutBuf.  This routine
!     is fundamentally equivalent to ParBeginTransferReal; the use
!     of a communication pattern is largely a performance enhancement,
!     since it eliminates the need for intermediate buffering.
!
!     The wait handles InHandle and OutHandle are module variables.
!     The buffers may not be accessed until after the corresponding
!     ParEndTransfer call.
!
! !REVISION HISTORY:
!   01.10.01   Sawyer     Creation from ParBeginTransferPattern
!   02.08.13   Sawyer     Now uses mod_comm unless Use_Mpi_Types
!   03.06.24   Sawyer     All complexity now in mp_sendirr
! 
!EOP
!-----------------------------------------------------------------------
!BOC

      CPP_ENTER_PROCEDURE( "PARBEGINTRANSFERPATTERN3D" )

      CALL mp_sendirr( InComm,Pattern%SendDesc,Pattern%RecvDesc,InBuf,OutBuf )

      CPP_LEAVE_PROCEDURE( "PARBEGINTRANSFERPATTERN3D" )
      RETURN
!EOC
      END SUBROUTINE ParBeginTransferPattern3D
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParBeginTransferPattern4D --- Start an ASYNC Pattern Transfer
!
! !INTERFACE:

      SUBROUTINE ParBeginTransferPattern4D( InComm, Pattern, InBuf, OutBuf )

! !USES:
      USE mod_comm, ONLY : mp_sendirr
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )       :: InComm       ! Communicator
      TYPE (ParPatternType), INTENT(IN)  :: Pattern        ! Comm Pattern
      REAL(CPP_REAL8), INTENT(IN)        :: InBuf(:,:,:,:) ! Input buffer

! !OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT(OUT)       :: OutBuf(:,:,:,:)! Output buffer

! !DESCRIPTION: 
!
!     This routine initiates an asynchronous transfer of the array InBuf.
!     The communication pattern indicates the indices of the outgoing
!     values in InBuf and of the incoming values in OutBuf.  This routine
!     is fundamentally equivalent to ParBeginTransferReal; the use
!     of a communication pattern is largely a performance enhancement,
!     since it eliminates the need for intermediate buffering.
!
!     The wait handles InHandle and OutHandle are module variables.
!     The buffers may not be accessed until after the corresponding
!     ParEndTransfer call.
!
! !REVISION HISTORY:
!   02.12.19   Sawyer     Creation from ParBeginTransferPattern
!   03.06.24   Sawyer     All complexity now in mp_sendirr
! 
!EOP
!-----------------------------------------------------------------------
!BOC

      CPP_ENTER_PROCEDURE( "PARBEGINTRANSFERPATTERN4D" )

      CALL mp_sendirr( InComm,Pattern%SendDesc,Pattern%RecvDesc,InBuf,OutBuf )

      CPP_LEAVE_PROCEDURE( "PARBEGINTRANSFERPATTERN4D" )
      RETURN
!EOC
      END SUBROUTINE ParBeginTransferPattern4D
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParEndTransferReal --- Complete an ASYNC Real Transfer
!
! !INTERFACE:

      SUBROUTINE ParEndTransferReal( InComm, NrInPackets, NrOutPackets,  &
                                     Dest, Src, InBuf, InIA,             &
                                     OutBuf, OutIA )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )       :: InComm       ! Communicator
      INTEGER, INTENT( IN )       :: NrInPackets  ! Number of in packets
      INTEGER, INTENT( IN )       :: NrOutPackets ! Number of out packets
      INTEGER, INTENT( IN )       :: Dest(:)      ! PE destinations
      INTEGER, INTENT( IN )       :: Src(:)       ! PE sources
      REAL(CPP_REAL8), INTENT(IN) :: InBuf(:)     ! Input buffer
      INTEGER, INTENT( IN )       :: InIA(:)      ! Pointer array
      INTEGER, INTENT( IN )       :: OutIA(:)     ! Pointer array

! !INPUT/OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( INOUT ) :: OutBuf(:)! Output buffer

! !DESCRIPTION: 
!
!     This routine completes an asynchronous transfer of an array
!     partitioned into parcels defined by the array InIA.  In the
!     MPI version, neither InBuf nor OutBuf is used, since
!     that information was already supplied to ParBeginTransferReal.
!
!     The link between StartTransfer and EndTransfer is made possible
!     by the InHandle and OutHandle: they reflect the status of
!     the ongoing transfer.  When this routine completes, a valid
!     and accessible copy of the OutBuf is ready for use.
!
! !BUGS:
!
!     It is assumed that the buffers are passed to this routine by
!     reference! The buffers may not be accessed until after the 
!     completion of ParEndTransferReal.  
!
!
! !SYSTEM ROUTINES:
!     MPI_COMM_RANK, MPI_ISEND, MPI_IRECV
!
! !REVISION HISTORY:
!   97.09.26   Sawyer     Creation
!   97.12.05   Sawyer     Renamed Comm to InComm to avoid collisions
!   98.02.26   Sawyer     Count through packets, not PEs
!   98.04.16   Sawyer     Number of packets become input arguments
!   98.09.04   Sawyer     Cleaned interface: handles in common
!   99.03.05   Sawyer     Support for contiguous communicators in SHMEM
!   99.04.22   Sawyer     Bug fix: replaced MPI_WAIT with MPI_WAITALL
!   99.06.03   Sawyer     Bug fix: GroupSize in SHMEM_BARRIER
!   00.07.28   Sawyer     Implemented with shared memory arenas
!   01.09.27   Sawyer     Added multiple shared buffers for USE_MLP
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      INTEGER Iam, GroupSize, J, Offset, Packet, Ierr
      INTEGER InStats(NrInPackets*MPI_STATUS_SIZE)
      INTEGER OutStats(NrOutPackets*MPI_STATUS_SIZE)

      CPP_ENTER_PROCEDURE( "PARENDTRANSFERREAL" )

!
! Increment the receiver 
      EndTrf = MOD(EndTrf,MAX_TRF)+1

      CPP_ASSERT_F90( NrInPackets .LE. MAX_PAX )
      CALL MPI_WAITALL( NrInPackets, InHandle(:,1,EndTrf), InStats, Ierr )
 
      CPP_ASSERT_F90( NrOutPackets .LE. MAX_PAX )
      CALL MPI_WAITALL( NrOutPackets, OutHandle(:,1,EndTrf), OutStats, Ierr )
!
! WS 98.09.22 : This barrier needed to synchronize.
!
      CALL MPI_BARRIER( InComm, Ierr )

      CPP_LEAVE_PROCEDURE( "PARENDTRANSFERREAL" )
      RETURN
!EOC
      END SUBROUTINE ParEndTransferReal
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParEndTransferPattern1D --- Complete ASYNC Pattern Transfer
!
! !INTERFACE:

      SUBROUTINE ParEndTransferPattern1D( InComm, Pattern, InBuf, OutBuf )

! !USES:
      USE mod_comm, ONLY : mp_recvirr
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )       :: InComm       ! Communicator
      TYPE (ParPatternType), INTENT( IN )  :: Pattern   ! Comm Pattern
      REAL(CPP_REAL8), INTENT( IN )        :: InBuf(*)  ! Input buffer

! !INPUT/OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( INOUT )     :: OutBuf(*) ! Output buffer

! !DESCRIPTION: 
!
!     This routine completes an async. transfer of an array communicated
!     with a communication pattern.  
!
!     The link between StartTransfer and EndTransfer is made possible
!     by the InHandle and OutHandle: they reflect the status of
!     the ongoing transfer.  When this routine completes, a valid
!     and accessible copy of the OutBuf is ready for use.
!     The buffers may not be accessed until after the 
!     completion of ParEndTransfer.  
!
! !BUGS:
!
!     It is assumed that the buffers are passed to this routine by
!     reference.
!
! !REVISION HISTORY:
!   01.02.14   Sawyer     Creation from ParEndTransferReal
!   02.08.13   Sawyer     Now uses mod_comm unless Use_Mpi_Types
!   03.06.24   Sawyer     All complexity now in mp_recvirr
!
!EOP
!-----------------------------------------------------------------------
!BOC

      CPP_ENTER_PROCEDURE( "PARENDTRANSFERPATTERN1D" )

      CALL mp_recvirr( InComm,Pattern%SendDesc,Pattern%RecvDesc,InBuf,OutBuf )

      CPP_LEAVE_PROCEDURE( "PARENDTRANSFERPATTERN1D" )
      RETURN
!EOC
      END SUBROUTINE ParEndTransferPattern1D
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParEndTransferPattern1Dint --- Complete ASYNC Pattern Transfer
!
! !INTERFACE:

      SUBROUTINE ParEndTransferPattern1Dint( InComm, Pattern, InBuf, OutBuf )

! !USES:
      USE mod_comm, ONLY : mp_recvirr_i4
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )       :: InComm       ! Communicator
      TYPE (ParPatternType), INTENT( IN )  :: Pattern   ! Comm Pattern
      INTEGER, INTENT( IN )                :: InBuf(*)  ! Input buffer

! !INPUT/OUTPUT PARAMETERS:
      INTEGER, INTENT( INOUT )             :: OutBuf(*) ! Output buffer

! !DESCRIPTION: 
!
!     This routine completes an async. transfer of an array communicated
!     with a communication pattern.  
!
!     The link between StartTransfer and EndTransfer is made possible
!     by the InHandle and OutHandle: they reflect the status of
!     the ongoing transfer.  When this routine completes, a valid
!     and accessible copy of the OutBuf is ready for use.
!     The buffers may not be accessed until after the 
!     completion of ParEndTransfer.  
!
! !BUGS:
!
!     It is assumed that the buffers are passed to this routine by
!     reference.
!
! !REVISION HISTORY:
!   01.02.14   Sawyer     Creation from ParEndTransferReal
!   02.08.13   Sawyer     Now uses mod_comm unless Use_Mpi_Types
!   03.06.24   Sawyer     All complexity now in mp_recvirr_i4
!
!EOP
!-----------------------------------------------------------------------
!BOC

      CPP_ENTER_PROCEDURE( "PARENDTRANSFERPATTERN1DINT" )

      CALL mp_recvirr_i4( InComm,Pattern%SendDesc,Pattern%RecvDesc,InBuf,OutBuf )

      CPP_LEAVE_PROCEDURE( "PARENDTRANSFERPATTERN1DINT" )
      RETURN
!EOC
      END SUBROUTINE ParEndTransferPattern1Dint
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParEndTransferPattern2D --- Complete an ASYNC Pattern Transfer
!
! !INTERFACE:

      SUBROUTINE ParEndTransferPattern2D( InComm, Pattern, InBuf, OutBuf )

! !USES:
      USE mod_comm, ONLY : mp_recvirr
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )       :: InComm       ! Communicator
      TYPE (ParPatternType), INTENT( IN )  :: Pattern       ! Comm Pattern
      REAL(CPP_REAL8), INTENT( IN )        :: InBuf(:,:)    ! Input buffer

! !INPUT/OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( INOUT )     :: OutBuf(:,:)   ! Output buffer

! !DESCRIPTION: 
!
!     This routine completes an async. transfer of an array communicated
!     with a communication pattern.  
!
!     The link between StartTransfer and EndTransfer is made possible
!     by the InHandle and OutHandle: they reflect the status of
!     the ongoing transfer.  When this routine completes, a valid
!     and accessible copy of the OutBuf is ready for use.
!     The buffers may not be accessed until after the 
!     completion of ParEndTransfer.  
!
! !BUGS:
!
!     It is assumed that the buffers are passed to this routine by
!     reference.
!
! !REVISION HISTORY:
!   01.10.01   Sawyer     Creation from ParEndTransferPattern
!   02.08.13   Sawyer     Now uses mod_comm unless Use_Mpi_Types
!   03.06.24   Sawyer     All complexity now in mp_recvirr
!
!EOP
!-----------------------------------------------------------------------
!BOC

      CPP_ENTER_PROCEDURE( "PARENDTRANSFERPATTERN2D" )

      CALL mp_recvirr( InComm,Pattern%SendDesc,Pattern%RecvDesc,InBuf(:,:),OutBuf(:,:) )

      CPP_LEAVE_PROCEDURE( "PARENDTRANSFERPATTERN2D" )
      RETURN
!EOC
      END SUBROUTINE ParEndTransferPattern2D
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParEndTransferPattern3D --- Complete an ASYNC Pattern Transfer
!
! !INTERFACE:

      SUBROUTINE ParEndTransferPattern3D( InComm, Pattern, InBuf, OutBuf )

! !USES:
      USE mod_comm, ONLY : mp_recvirr
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )       :: InComm       ! Communicator
      TYPE (ParPatternType), INTENT( IN )  :: Pattern       ! Comm Pattern
      REAL(CPP_REAL8), INTENT( IN )        :: InBuf(:,:,:)  ! Input buffer

! !INPUT/OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( INOUT )     :: OutBuf(:,:,:) ! Output buffer

! !DESCRIPTION: 
!
!     This routine completes an async. transfer of an array communicated
!     with a communication pattern.  
!
!     The link between StartTransfer and EndTransfer is made possible
!     by the InHandle and OutHandle: they reflect the status of
!     the ongoing transfer.  When this routine completes, a valid
!     and accessible copy of the OutBuf is ready for use.
!     The buffers may not be accessed until after the 
!     completion of ParEndTransfer.  
!
! !BUGS:
!
!     It is assumed that the buffers are passed to this routine by
!     reference.
!
! !REVISION HISTORY:
!   01.10.01   Sawyer     Creation from ParEndTransferPattern
!   02.08.13   Sawyer     Now uses mod_comm unless Use_Mpi_Types
!   03.06.24   Sawyer     All complexity now in mp_recvirr
!
!EOP
!-----------------------------------------------------------------------
!BOC

      CPP_ENTER_PROCEDURE( "PARENDTRANSFERPATTERN3D" )

      CALL mp_recvirr( InComm,Pattern%SendDesc,Pattern%RecvDesc,InBuf(:,:,:),OutBuf(:,:,:) )

      CPP_LEAVE_PROCEDURE( "PARENDTRANSFERPATTERN3D" )
      RETURN
!EOC
      END SUBROUTINE ParEndTransferPattern3D
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParEndTransferPattern4D --- Complete an ASYNC Pattern Transfer
!
! !INTERFACE:

      SUBROUTINE ParEndTransferPattern4D( InComm, Pattern, InBuf, OutBuf )

! !USES:
      USE mod_comm, ONLY : mp_recvirr
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )       :: InComm       ! Communicator
      TYPE (ParPatternType), INTENT( IN )  :: Pattern        ! Comm Pattern
      REAL(CPP_REAL8), INTENT( IN )        :: InBuf(:,:,:,:) ! Input buffer

! !INPUT/OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( INOUT )     :: OutBuf(:,:,:,:)! Output buffer

! !DESCRIPTION: 
!
!     This routine completes an async. transfer of an array communicated
!     with a communication pattern.  
!
!     The link between StartTransfer and EndTransfer is made possible
!     by the InHandle and OutHandle: they reflect the status of
!     the ongoing transfer.  When this routine completes, a valid
!     and accessible copy of the OutBuf is ready for use.
!     The buffers may not be accessed until after the 
!     completion of ParEndTransfer.  
!
! !BUGS:
!
!     It is assumed that the buffers are passed to this routine by
!     reference.
!
! !REVISION HISTORY:
!   02.12.19   Sawyer     Creation from ParEndTransferPattern
!   03.06.24   Sawyer     All complexity now in mp_recvirr
!
!EOP
!-----------------------------------------------------------------------
!BOC

      CPP_ENTER_PROCEDURE( "PARENDTRANSFERPATTERN4D" )

      CALL mp_recvirr( InComm,Pattern%SendDesc,Pattern%RecvDesc,InBuf,OutBuf )

      CPP_LEAVE_PROCEDURE( "PARENDTRANSFERPATTERN4D" )
      RETURN
!EOC
      END SUBROUTINE ParEndTransferPattern4D
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParExchangeVectorReal --- Exchange a sparse packed vector
!
! !INTERFACE:  

      SUBROUTINE ParExchangeVectorReal ( InComm, LenInVector, InVector,  &
                                         LenOutVector, OutVector )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )   :: InComm            ! Communicator
      INTEGER, INTENT( IN )   :: LenInVector( * )  ! Length on each PE
      REAL(CPP_REAL8), INTENT( IN ):: InVector( * ) ! The input buffer

! !OUTPUT PARAMETERS:
      INTEGER, INTENT( OUT )  :: LenOutVector( * ) ! Length on each PE
      REAL(CPP_REAL8), INTENT( OUT ) :: OutVector( * ) ! The output buffer

! !DESCRIPTION:
!
!     This routine exchanges vectors stored in compressed format, i.e.,
!     in so-called compressed sparse row (CSR) format, with other
!     PEs.  In essence it first exchanges the lengths with
!     MPI\_Alltoall and then exchanges the actual vectors (which may
!     differ in size) as MPI\_AlltoallV would.  Since the latter is
!     inefficient, it is emulated with MPI\_Isend and MPI\_Recv.
!
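!     Layout sketch (the counts below are illustrative assumptions):
!     with 4 PEs and LenInVector = (/ 2, 0, 3, 1 /), InVector packs the
!     two values destined for PE 0 first, then the three for PE 2, then
!     the one for PE 3 (six entries in all).  On return,
!     LenOutVector(pe+1) holds the number of values received from PE pe,
!     and OutVector is packed in the same ascending-PE order.
!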
! !SYSTEM ROUTINES:
!     MPI_ISEND, MPI_RECV, MPI_WAITALL, MPI_ALLTOALL
!
! !REVISION HISTORY:
!   98.03.17   Sawyer     Creation from F77 version
!   98.03.30   Sawyer     Removed assumed shape arrays due to problems
!   99.01.18   Sawyer     Added barrier for safety
!   99.03.08   Sawyer     USE_SHMEM version for CRAY only; untested
!   99.06.01   Sawyer     USE_SHMEM version revised per comments from Tom
!   00.07.28   Sawyer     Implemented with shared memory arenas
!
!EOP
!-----------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER :: i, iscnt, ircnt, nr, pe, icnt, Nsize, Iam, Ierr
      INTEGER :: Status(MPI_STATUS_SIZE)
      Integer, allocatable :: Reqs(:), Stats(:)

      CPP_ENTER_PROCEDURE( "PAREXCHANGEVECTORREAL" )

      CALL MPI_COMM_SIZE( InComm, Nsize, Ierr )
      CALL MPI_COMM_RANK( InComm, Iam, Ierr )

      allocate (Reqs(Nsize))
      allocate (Stats(Nsize*MPI_STATUS_SIZE))

#if defined( MY_ALLTOALL )
      DO pe = 0, Nsize-1
!
! Send the individual buffers with non-blocking sends
!
        nr = LenInVector( pe + 1 )
        CALL MPI_ISEND( nr, 1, CPP_MPI_INTEGER, pe, Iam+3000,             &
                        InComm, Reqs( pe+1 ), Ierr )
      ENDDO
      DO pe = 0, Nsize - 1
!
! Receive the buffers with MPI_Recv. Now we are blocking.
!
        CALL MPI_RECV( nr, 1, CPP_MPI_INTEGER, pe, pe+3000,               &
                       InComm, Status, Ierr )
        LenOutVector(pe + 1) = nr
      ENDDO
      CALL MPI_WAITALL( Nsize, Reqs, Stats, Ierr )
#else
      CALL MPI_ALLTOALL( LenInVector, 1, CPP_MPI_INTEGER,                 &
                         LenOutVector, 1, CPP_MPI_INTEGER,                &
                         InComm, Ierr )
#endif
!
! Over all processes
!
      icnt = 1
      DO pe = 0, Nsize-1
!
! Send the individual buffers with non-blocking sends
!
        nr = LenInVector( pe + 1 )
        IF ( nr .gt. 0 ) THEN
          CALL MPI_ISEND( InVector( icnt ), nr,                           &
                          CPP_MPI_REAL8, pe, Iam+2000,                    &
                          InComm, Reqs( pe+1 ), Ierr )
        ELSE
          Reqs( pe+1 ) = MPI_REQUEST_NULL
        ENDIF
        icnt = icnt + nr
      ENDDO

!
! Over all processes
!
      icnt = 1
      DO pe = 0, Nsize - 1
!
! Receive the buffers with MPI_Recv. Now we are blocking. 
!
        nr = LenOutVector(pe + 1)
        IF ( nr .gt. 0 ) THEN
          CALL MPI_RECV( OutVector( icnt ), nr,                          &
                         CPP_MPI_REAL8, pe, pe+2000,                     &
                         InComm, Status, Ierr )
        ENDIF
        icnt = icnt + nr
      ENDDO
      CALL MPI_WAITALL( Nsize, Reqs, Stats, Ierr )

      deallocate (Reqs)
      deallocate (Stats)

      CPP_LEAVE_PROCEDURE( "PAREXCHANGEVECTORREAL" )

      RETURN
!EOC
      END SUBROUTINE ParExchangeVectorReal
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParExchangeVectorReal4 --- Exchange a sparse packed vector
!
! !INTERFACE:  

      SUBROUTINE ParExchangeVectorReal4 ( InComm, LenInVector, InVector,&
                                          LenOutVector, OutVector )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )   :: InComm            ! Communicator
      INTEGER, INTENT( IN )   :: LenInVector( * )  ! Length on each PE
      REAL(CPP_REAL4), INTENT( IN ):: InVector( * ) ! The input buffer

! !OUTPUT PARAMETERS:
      INTEGER, INTENT( OUT )  :: LenOutVector( * ) ! Length on each PE
      REAL(CPP_REAL4), INTENT( OUT ) :: OutVector( * ) ! The output buffer

! !DESCRIPTION:
!
!     This routine exchanges vectors stored in compressed format, i.e.,
!     in so-called compressed sparse row (CSR) format, with other
!     PEs.  In essence it first exchanges the lengths with
!     MPI\_Alltoall and then exchanges the actual vectors (which may
!     differ in size) as MPI\_AlltoallV would.  Since the latter is
!     inefficient, it is emulated with MPI\_Isend and MPI\_Recv.
!
! !SYSTEM ROUTINES:
!     MPI_ISEND, MPI_RECV, MPI_WAITALL, MPI_ALLTOALL
!
! !REVISION HISTORY:
!   98.03.17   Sawyer     Creation from F77 version
!   98.03.30   Sawyer     Removed assumed shape arrays due to problems
!   99.01.18   Sawyer     Added barrier for safety
!   99.03.08   Sawyer     USE_SHMEM version for CRAY only; untested
!   99.06.01   Sawyer     USE_SHMEM version revised per comments from Tom
!   00.07.28   Sawyer     Implemented with shared memory arenas
!
!EOP
!-----------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER :: i, iscnt, ircnt, nr, pe, icnt, Nsize, Iam, Ierr
      INTEGER :: Status(MPI_STATUS_SIZE)
      Integer, allocatable :: Reqs(:), Stats(:)

      CPP_ENTER_PROCEDURE( "PAREXCHANGEVECTORREAL4" )

      CALL MPI_COMM_SIZE( InComm, Nsize, Ierr )
      CALL MPI_COMM_RANK( InComm, Iam, Ierr )

      allocate (Reqs(Nsize))
      allocate (Stats(Nsize*MPI_STATUS_SIZE))

#if defined( MY_ALLTOALL )
      DO pe = 0, Nsize-1
!
! Send the individual buffers with non-blocking sends
!
        nr = LenInVector( pe + 1 )
        CALL MPI_ISEND( nr, 1, CPP_MPI_INTEGER, pe, Iam+3000,             &
                        InComm, Reqs( pe+1 ), Ierr )
      ENDDO
      DO pe = 0, Nsize - 1
!
! Receive the buffers with MPI_Recv. Now we are blocking.
!
        CALL MPI_RECV( nr, 1, CPP_MPI_INTEGER, pe, pe+3000,               &
                       InComm, Status, Ierr )
        LenOutVector(pe + 1) = nr
      ENDDO
      CALL MPI_WAITALL( Nsize, Reqs, Stats, Ierr )
#else
      CALL MPI_ALLTOALL( LenInVector, 1, CPP_MPI_INTEGER,                 &
                         LenOutVector, 1, CPP_MPI_INTEGER,                &
                         InComm, Ierr )
#endif
!
! Over all processes
!
      icnt = 1
      DO pe = 0, Nsize-1
!
! Send the individual buffers with non-blocking sends
!
        nr = LenInVector( pe + 1 )
        IF ( nr .gt. 0 ) THEN
          CALL MPI_ISEND( InVector( icnt ), nr,                           &
                          CPP_MPI_REAL4, pe, Iam+2000,                    &
                          InComm, Reqs( pe+1 ), Ierr )
        ELSE
          Reqs( pe+1 ) = MPI_REQUEST_NULL
        ENDIF
        icnt = icnt + nr
      ENDDO

!
! Over all processes
!
      icnt = 1
      DO pe = 0, Nsize - 1
!
! Receive the buffers with MPI_Recv. Now we are blocking. 
!
        nr = LenOutVector(pe + 1)
        IF ( nr .gt. 0 ) THEN
          CALL MPI_RECV( OutVector( icnt ), nr,                          &
                         CPP_MPI_REAL4, pe, pe+2000,                     &
                         InComm, Status, Ierr )
        ENDIF
        icnt = icnt + nr
      ENDDO
      CALL MPI_WAITALL( Nsize, Reqs, Stats, Ierr )

      deallocate (Reqs)
      deallocate (Stats)

      CPP_LEAVE_PROCEDURE( "PAREXCHANGEVECTORREAL4" )

      RETURN
!EOC
      END SUBROUTINE ParExchangeVectorReal4
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParExchangeVectorInt --- Exchange a sparse packed vector
!
! !INTERFACE:  

      SUBROUTINE ParExchangeVectorInt ( InComm, LenInVector, InVector,   &
                                         LenOutVector, OutVector )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )   :: InComm            ! Communicator
      INTEGER, INTENT( IN )   :: LenInVector( * )  ! Length on each PE
      INTEGER, INTENT( IN )   :: InVector( * )     ! The input buffer

! !OUTPUT PARAMETERS:
      INTEGER, INTENT( OUT )  :: LenOutVector( * ) ! Length on each PE
      INTEGER, INTENT( OUT )  :: OutVector( * )    ! The output buffer

! !DESCRIPTION:
!
!     This routine exchanges vectors stored in compressed format, i.e.,
!     in so-called compressed sparse row (CSR) format, with other
!     PEs.  In essence it first exchanges the lengths with
!     MPI\_Alltoall and then exchanges the actual vectors (which may
!     differ in size) as MPI\_AlltoallV would.  Since the latter is
!     inefficient, it is emulated with MPI\_Isend and MPI\_Recv.
!
! !SYSTEM ROUTINES:
!     MPI_ISEND, MPI_RECV, MPI_WAITALL, MPI_ALLTOALL
!
! !REVISION HISTORY:
!   98.03.17   Sawyer     Creation from F77 version
!   98.03.30   Sawyer     Removed assumed shape arrays due to problems
!   99.01.18   Sawyer     Added barrier for safety
!   99.03.08   Sawyer     USE_SHMEM version for CRAY only; untested
!   99.06.01   Sawyer     USE_SHMEM version revised per comments from Tom
!   00.07.28   Sawyer     Implemented with shared memory arenas
!
!EOP
!-----------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER :: i, iscnt, ircnt, nr, pe, icnt, Nsize, Iam, Ierr
      INTEGER :: Status(MPI_STATUS_SIZE)
      Integer, allocatable :: Reqs(:), Stats(:)

      CPP_ENTER_PROCEDURE( "PAREXCHANGEVECTORINT" )

      CALL MPI_COMM_SIZE( InComm, Nsize, Ierr )
      CALL MPI_COMM_RANK( InComm, Iam, Ierr )

      allocate (Reqs(Nsize))
      allocate (Stats(Nsize*MPI_STATUS_SIZE))

#if defined( MY_ALLTOALL )
      DO pe = 0, Nsize-1
!
! Send the individual buffers with non-blocking sends
!
        nr = LenInVector( pe + 1 )
        CALL MPI_ISEND( nr, 1,                                           &
                        MPI_INTEGER, pe, Iam+3000,                       &
                        InComm, Reqs( pe+1 ), Ierr )
      ENDDO
      DO pe = 0, Nsize - 1
!
! Receive the buffers with MPI_Recv. Now we are blocking.
!
        CALL MPI_RECV( nr, 1,                                                 &
                       MPI_INTEGER, pe, pe+3000,                              &
                       InComm, Status, Ierr )
        LenOutVector(pe + 1) = nr
      ENDDO
      CALL MPI_WAITALL( Nsize, Reqs, Stats, Ierr )
#else
      CALL MPI_ALLTOALL( LenInVector, 1, CPP_MPI_INTEGER,                     &
                         LenOutVector, 1, CPP_MPI_INTEGER,                    &
                         InComm, Ierr )
#endif
!
! Over all processes
!
      icnt = 1
      DO pe = 0, Nsize-1
!
! Send the individual buffers with non-blocking sends
!
        nr = LenInVector( pe + 1 )
        IF ( nr .gt. 0 ) THEN
          CALL MPI_ISEND( InVector( icnt ), nr,                               &
                          CPP_MPI_INTEGER, pe, Iam+2000,                      &
                          InComm, Reqs( pe+1 ), Ierr )
        ELSE
          Reqs( pe+1 ) = MPI_REQUEST_NULL
        ENDIF
        icnt = icnt + nr
      ENDDO

!
! Over all processes
!
      icnt = 1
      DO pe = 0, Nsize - 1
!
! Receive the buffers with MPI_Recv. Now we are blocking. 
!
        nr = LenOutVector(pe + 1)
        IF ( nr .gt. 0 ) THEN
          CALL MPI_RECV( OutVector( icnt ), nr,                               &
                         CPP_MPI_INTEGER, pe, pe+2000,                        &
                         InComm, Status, Ierr )
        ENDIF
        icnt = icnt + nr
      ENDDO
      CALL MPI_WAITALL( Nsize, Reqs, Stats, Ierr )
!
! WS 98.09.22 : This barrier needed to synchronize.  Why?
!
      CALL MPI_BARRIER( InComm, Ierr )

      deallocate (Reqs)
      deallocate (Stats)

      CPP_LEAVE_PROCEDURE( "PAREXCHANGEVECTORINT" )

      RETURN
!EOC
      END SUBROUTINE ParExchangeVectorInt
!-----------------------------------------------------------------------


!-----------------------------------------------------------------------
!BOP
! !ROUTINE: ParCollectiveBarrier --- Barrier: Simplest collective op.
!
! !INTERFACE:

      SUBROUTINE ParCollectiveBarrier( InComm )

! !USES:
      IMPLICIT NONE
! !INPUT PARAMETERS:
      INTEGER, INTENT( IN ) :: InComm   ! Communicator

! !DESCRIPTION:
!
!     This routine performs a barrier only within the communicator InComm
!     
! !REVISION HISTORY:
!   00.09.10   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
      INTEGER Ierror

      CALL MPI_Barrier(InComm, Ierror )

      RETURN
!EOC
      END SUBROUTINE ParCollectiveBarrier
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !ROUTINE: ParCollective0D --- Perform global Collective of a scalar
!
! !INTERFACE:

      SUBROUTINE ParCollective0D( InComm, Op, Var )

! !USES:
      IMPLICIT NONE
! !INPUT PARAMETERS:
      INTEGER, INTENT( IN ) :: InComm   ! Communicator
      INTEGER, INTENT( IN ) :: Op       ! Operation (see header)

! !INPUT/OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( INOUT ) :: Var  ! partial Var in, Var out

! !DESCRIPTION:
!
!     This utility makes a collective operation over all processes in 
!     communicator InComm.  
!     
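!     Usage sketch (illustrative; the variable names, and the assumption
!     that this routine is reached through the generic ParCollective
!     interface, are not taken from this file):
!
!        REAL(CPP_REAL8) :: LocalSum, Setting
!        CALL ParCollective( CommGlobal, SUMOP, LocalSum )   ! all-reduce sum
!        CALL ParCollective( CommGlobal, BCSTOP, Setting )   ! broadcast from PE 0
!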
! !REVISION HISTORY:
!   00.08.07   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
      INTEGER Ierror
      REAL(CPP_REAL8)    Tmp

      IF ( Op .EQ. BCSTOP ) THEN
        CALL MPI_BCAST( Var, 1, CPP_MPI_REAL8, 0, InComm, Ierror )
      ELSE
        CALL MPI_ALLREDUCE( Var, Tmp, 1, CPP_MPI_REAL8,                  &
                            Op, InComm, Ierror )
        Var = Tmp
      ENDIF

      RETURN
!EOC
      END SUBROUTINE ParCollective0D
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !ROUTINE: ParCollective1D --- Perform component-wise global Collective of a vector
!
! !INTERFACE:

      SUBROUTINE ParCollective1D( InComm, Op, Im, Var )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN ) :: InComm   ! Communicator
      INTEGER, INTENT( IN ) :: Op       ! Operation (see header)
      INTEGER, INTENT( IN ) :: Im       ! Size of 1-D array

! !INPUT/OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( INOUT ) :: Var(Im) ! partial Var in, Var out

! !DESCRIPTION:
!
!     This utility makes a collective operation over all processes in 
!     communicator InComm.  
!     
! !REVISION HISTORY:
!   00.08.07   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
      INTEGER Ierror
      REAL(CPP_REAL8)    Tmp(Im)

      IF ( Op .EQ. BCSTOP ) THEN
        CALL MPI_BCAST( Var, Im, CPP_MPI_REAL8, 0, InComm, Ierror )
      ELSE
        CALL MPI_ALLREDUCE( Var, Tmp, Im, CPP_MPI_REAL8,                 &
                            Op, InComm, Ierror )
        Var = Tmp
      ENDIF

      RETURN
!EOC
      END SUBROUTINE ParCollective1D
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !ROUTINE: ParCollective1DReal4 --- Perform component-wise global Collective of a vector
!
! !INTERFACE:

      SUBROUTINE ParCollective1DReal4( InComm, Op, Im, Var )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN ) :: InComm   ! Communicator
      INTEGER, INTENT( IN ) :: Op       ! Operation (see header)
      INTEGER, INTENT( IN ) :: Im       ! Size of 1-D array

! !INPUT/OUTPUT PARAMETERS:
      REAL(CPP_REAL4), INTENT( INOUT ) :: Var(Im) ! partial Var in, Var out

! !DESCRIPTION:
!
!     This utility makes a collective operation over all processes in 
!     communicator InComm.  
!     
! !REVISION HISTORY:
!   00.08.07   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
      INTEGER Ierror
      REAL(CPP_REAL4)    Tmp(Im)

      IF ( Op .EQ. BCSTOP ) THEN
        CALL MPI_BCAST( Var, Im, CPP_MPI_REAL4, 0, InComm, Ierror )
      ELSE
        CALL MPI_ALLREDUCE( Var, Tmp, Im, CPP_MPI_REAL4,                 &
                            Op, InComm, Ierror )
        Var = Tmp
      ENDIF

      RETURN
!EOC
      END SUBROUTINE ParCollective1DReal4
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !ROUTINE: ParCollective2D --- Perform component-wise collective operation
!
! !INTERFACE:

      SUBROUTINE ParCollective2D( InComm, Op, Im, Jm, Var )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN ) :: InComm     ! Communicator
      INTEGER, INTENT( IN ) :: Op         ! Operation (see header)
      INTEGER, INTENT( IN ) :: Im         ! First dimension of 2-D array
      INTEGER, INTENT( IN ) :: Jm         ! Second dimension of 2-D array

! !INPUT/OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( INOUT ) :: Var(Im,Jm) ! partial Var in, Var out

! !DESCRIPTION:
!
!     This utility makes a collective operation over all processes in 
!     communicator InComm.  
!     
! !REVISION HISTORY:
!   00.08.07   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
      INTEGER Ierror
      REAL(CPP_REAL8)    Tmp(Im,Jm)

      IF ( Op .EQ. BCSTOP ) THEN
        CALL MPI_BCAST( Var, Im*Jm, CPP_MPI_REAL8, 0, InComm, Ierror )
      ELSE
        CALL MPI_ALLREDUCE( Var, Tmp, Im*Jm, CPP_MPI_REAL8,              &
                            Op, InComm, Ierror )
        Var = Tmp
      ENDIF

      RETURN
!EOC
      END SUBROUTINE ParCollective2D
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !ROUTINE: ParCollective2DReal4 --- Perform component-wise collective operation
!
! !INTERFACE:

      SUBROUTINE ParCollective2DReal4( InComm, Op, Im, Jm, Var )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN ) :: InComm     ! Communicator
      INTEGER, INTENT( IN ) :: Op         ! Operation (see header)
      INTEGER, INTENT( IN ) :: Im         ! First dimension of 2-D array
      INTEGER, INTENT( IN ) :: Jm         ! Second dimension of 2-D array

! !INPUT/OUTPUT PARAMETERS:
      REAL(CPP_REAL4), INTENT( INOUT ) :: Var(Im,Jm) ! partial Var in, Var out

! !DESCRIPTION:
!
!     This utility makes a collective operation over all processes in 
!     communicator InComm.  
!     
! !REVISION HISTORY:
!   00.08.07   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
      INTEGER Ierror
      REAL(CPP_REAL4)    Tmp(Im,Jm)

      IF ( Op .EQ. BCSTOP ) THEN
        CALL MPI_BCAST( Var, Im*Jm, CPP_MPI_REAL4, 0, InComm, Ierror )
      ELSE
        CALL MPI_ALLREDUCE( Var, Tmp, Im*Jm, CPP_MPI_REAL4,              &
                            Op, InComm, Ierror )
        Var = Tmp
      ENDIF

      RETURN
!EOC
      END SUBROUTINE ParCollective2DReal4
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !ROUTINE: ParCollective3D --- Perform component-wise global Collective of a vector
!
! !INTERFACE:

      SUBROUTINE ParCollective3D( InComm, Op, Im, Jm, Lm, Var )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN ) :: InComm     ! Communicator
      INTEGER, INTENT( IN ) :: Op         ! Operation (see header)
      INTEGER, INTENT( IN ) :: Im         ! First dimension of 3-D array
      INTEGER, INTENT( IN ) :: Jm         ! Second dimension of 3-D array
      INTEGER, INTENT( IN ) :: Lm         ! Third dimension of 3-D array

! !INPUT/OUTPUT PARAMETERS:
      REAL(CPP_REAL8), INTENT( INOUT ) :: Var(Im,Jm,Lm) ! partial Var in, Var out

! !DESCRIPTION:
!
!     This utility makes a collective operation over all processes in 
!     communicator InComm.  
!     
! !REVISION HISTORY:
!   00.08.07   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
      INTEGER Ierror
      REAL(CPP_REAL8) Tmp(Im,Jm,Lm)

      IF ( Op .EQ. BCSTOP ) THEN
        CALL MPI_BCAST( Var, Im*Jm*Lm, CPP_MPI_REAL8, 0, InComm, Ierror )
      ELSE
        CALL MPI_ALLREDUCE( Var, Tmp, Im*Jm*Lm, CPP_MPI_REAL8,           &
                            Op, InComm, Ierror )
        Var = Tmp
      ENDIF

      RETURN
!EOC
      END SUBROUTINE ParCollective3D
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !ROUTINE: ParCollective0DInt --- Perform global Collective of a scalar
!
! !INTERFACE:

      SUBROUTINE ParCollective0DInt( InComm, Op, Var )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN ) :: InComm   ! Communicator
      INTEGER, INTENT( IN ) :: Op       ! Operation (see header)

! !INPUT/OUTPUT PARAMETERS:
      INTEGER, INTENT( INOUT ) :: Var   ! partial Var in, Var out

! !DESCRIPTION:
!
!     This utility makes a collective operation over all processes in 
!     communicator InComm.  
!     
! !REVISION HISTORY:
!   00.08.07   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
      INTEGER Ierror
      INTEGER    Tmp

      IF ( Op .EQ. BCSTOP ) THEN
        CALL MPI_BCAST( Var, 1, CPP_MPI_INTEGER, 0, InComm, Ierror )
      ELSE
        CALL MPI_ALLREDUCE( Var,Tmp,1,CPP_MPI_INTEGER,Op,InComm,Ierror )
        Var = Tmp
      ENDIF

      RETURN
!EOC
      END SUBROUTINE ParCollective0DInt
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !ROUTINE: ParCollective0DStr --- Perform global Collective of a string
!
! !INTERFACE:

      SUBROUTINE ParCollective0DStr( InComm, Op, Var )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN ) :: InComm   ! Communicator
      INTEGER, INTENT( IN ) :: Op       ! Operation (see header)

! !INPUT/OUTPUT PARAMETERS:
      CHARACTER (LEN=*), INTENT( INOUT ) :: Var ! partial Var in, Var out

! !DESCRIPTION:
!
!     This utility makes a collective operation over all processes in
!     communicator InComm.
!
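!     For character data only Op = BCSTOP is supported (broadcast from
!     PE 0).  A sketch, with an illustrative variable and assuming the
!     generic ParCollective interface resolves to this routine:
!
!        CHARACTER (LEN=256) :: FileName
!        CALL ParCollective( CommGlobal, BCSTOP, FileName )
!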
! !REVISION HISTORY:
!   00.08.07   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
      INTEGER Ierror, StrLen

      StrLen = LEN(Var)
      IF ( Op .EQ. BCSTOP ) THEN
        CALL MPI_BCAST( Var, StrLen, MPI_CHARACTER, 0, InComm, Ierror )
      ELSE
        write(iulog,*) "global reduction of string not supported"
      ENDIF

      RETURN
!EOC
      END SUBROUTINE ParCollective0DStr
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !ROUTINE: ParCollective1DStr --- Perform global Collective of a string
!
! !INTERFACE:

      SUBROUTINE ParCollective1DStr( InComm, Op, Im, Var )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN ) :: InComm   ! Communicator
      INTEGER, INTENT( IN ) :: Op       ! Operation (see header)
      INTEGER, INTENT( IN ) :: Im       ! Size of 1-D array

! !INPUT/OUTPUT PARAMETERS:
      CHARACTER (LEN=*), INTENT( INOUT ) :: Var(:)   ! partial Var in, Var out

! !DESCRIPTION:
!
!     This utility makes a collective operation over all processes in 
!     communicator InComm.  
!     
! !REVISION HISTORY:
!   00.08.07   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
      INTEGER Ierror, StrLen

      StrLen = LEN(Var(1))
      IF ( Op .EQ. BCSTOP ) THEN
        CALL MPI_BCAST( Var, Im*StrLen, MPI_CHARACTER, 0, InComm, Ierror )
      ELSE
        write(iulog,*) "global reduction of string not supported"
      ENDIF

      RETURN
!EOC
      END SUBROUTINE ParCollective1DStr
!-----------------------------------------------------------------------

!-----------------------------------------------------------------------
!BOP
! !ROUTINE: ParCollective1DInt --- Perform component-wise global 
!                                  collective operations of int vector
!
! !INTERFACE:

      SUBROUTINE ParCollective1DInt( InComm, Op, Im, Var )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN ) :: InComm   ! Communicator
      INTEGER, INTENT( IN ) :: Op       ! Operation (see header)
      INTEGER, INTENT( IN ) :: Im       ! Size of 1-D array

! !INPUT/OUTPUT PARAMETERS:
      INTEGER, INTENT( INOUT ) :: Var(Im) ! partial Var in, Var out

! !DESCRIPTION:
!
!     This utility makes a collective operation over all processes in 
!     communicator InComm.  
!     
! !REVISION HISTORY:
!   00.08.07   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
      INTEGER Ierror
      INTEGER Tmp(Im)

      IF ( Op .EQ. BCSTOP ) THEN
        CALL MPI_BCAST( Var, Im, CPP_MPI_INTEGER, 0, InComm, Ierror )
      ELSE
        CALL MPI_ALLREDUCE( Var,Tmp,Im,CPP_MPI_INTEGER,Op,InComm,Ierror )
        Var = Tmp
      ENDIF

      RETURN
!EOC
      END SUBROUTINE ParCollective1DInt
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
!BOP
! !ROUTINE: ParCollective2DInt --- Perform component-wise collective op.
!
! !INTERFACE:

      SUBROUTINE ParCollective2DInt( InComm, Op, Im, Jm, Var )

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN ) :: InComm     ! Communicator
      INTEGER, INTENT( IN ) :: Op         ! Operation (see header)
      INTEGER, INTENT( IN ) :: Im         ! First dimension of 2-D array
      INTEGER, INTENT( IN ) :: Jm         ! Second dimension of 2-D array

! !INPUT/OUTPUT PARAMETERS:
      INTEGER, INTENT( INOUT ):: Var(Im,Jm) ! partial Var in, Var out

! !DESCRIPTION:
!
!     This utility makes a collective operation over all processes in 
!     communicator InComm.  
!     
! !REVISION HISTORY:
!   00.08.07   Sawyer     Creation
!
!EOP
!---------------------------------------------------------------------
!BOC
      INTEGER Ierror
      INTEGER Tmp(Im,Jm)

      IF ( Op .EQ. BCSTOP ) THEN
        CALL MPI_BCAST( Var, Im*Jm, CPP_MPI_INTEGER, 0, InComm, Ierror )
      ELSE
        CALL MPI_ALLREDUCE( Var, Tmp, Im*Jm, CPP_MPI_INTEGER,           &
                            Op, InComm, Ierror )
        Var = Tmp
      ENDIF

      RETURN
!EOC
      END SUBROUTINE ParCollective2DInt
!-----------------------------------------------------------------------
# ifdef _SMEMORY
!-----------------------------------------------------------------------
!BOP
! !IROUTINE: ParExchangeLength --- Exchange a sparse packed vector
!
! !INTERFACE:  

      SUBROUTINE ParExchangeLength ( InComm, LenInVector, LenOutVector)

! !USES:
      IMPLICIT NONE

! !INPUT PARAMETERS:
      INTEGER, INTENT( IN )   :: InComm            ! Communicator
      INTEGER, INTENT( IN )   :: LenInVector( * )  ! Length on each PE

! !OUTPUT PARAMETERS:
      INTEGER, INTENT( OUT )  :: LenOutVector( * ) ! Length on each PE

! !DESCRIPTION:
!
!     This routine exchanges only the segment lengths that precede an
!     actual vector exchange: each PE's send counts (one per destination
!     PE) are exchanged with MPI\_Alltoall (or, if MY\_ALLTOALL is
!     defined, with MPI\_Isend/MPI\_Recv), so that every PE learns the
!     corresponding receive counts and can size its receive buffers
!     before the vectors themselves are exchanged.
!
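!     Usage sketch (illustrative; the variable names are assumptions):
!     exchange the per-PE counts so that receive-side storage can be
!     sized before the actual data exchange:
!
!        INTEGER :: SendCounts(Npes), RecvCounts(Npes)
!        CALL ParExchangeLength( InComm, SendCounts, RecvCounts )
!        NumRecv = SUM( RecvCounts )    ! total items this PE will receive
!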
! !SYSTEM ROUTINES:
!     MPI_ISEND, MPI_RECV, MPI_WAITALL, MPI_ALLTOALL
!
! !REVISION HISTORY:
!   98.03.17   Sawyer     Creation from F77 version
!   98.03.30   Sawyer     Removed assumed shape arrays due to problems
!   99.01.18   Sawyer     Added barrier for safety
!   99.03.08   Sawyer     USE_SHMEM version for CRAY only; untested
!   99.06.01   Sawyer     USE_SHMEM version revised per comments from Tom
!   00.07.28   Sawyer     Implemented with shared memory arenas
!
!EOP
!-----------------------------------------------------------------------
!BOC
!
! !LOCAL VARIABLES:
      INTEGER :: i, iscnt, ircnt, nr, pe, icnt, Nsize, Iam, Ierr
      INTEGER :: Status(MPI_STATUS_SIZE)
      Integer, allocatable :: Reqs(:), Stats(:)

      CPP_ENTER_PROCEDURE( "PAREXCHANGELENGTH" )

      CALL MPI_COMM_SIZE( InComm, Nsize, Ierr )
      CALL MPI_COMM_RANK( InComm, Iam, Ierr )

      allocate (Reqs(Nsize))
      allocate (Stats(Nsize*MPI_STATUS_SIZE))

#if defined( MY_ALLTOALL )
      DO pe = 0, Nsize-1
!
! Send the individual buffers with non-blocking sends
!
        nr = LenInVector( pe + 1 )
        CALL MPI_ISEND( nr, 1,                                           &
                        MPI_INTEGER, pe, Iam+3000,                       &
                        InComm, Reqs( pe+1 ), Ierr )
      ENDDO
      DO pe = 0, Nsize - 1
!
! Receive the buffers with MPI_Recv. Now we are blocking.
!
        CALL MPI_RECV( nr, 1,                                                 &
                       MPI_INTEGER, pe, pe+3000,                              &
                       InComm, Status, Ierr )
        LenOutVector(pe + 1) = nr
      ENDDO
      CALL MPI_WAITALL( Nsize, Reqs, Stats, Ierr )

      deallocate (Reqs)
      deallocate (Stats)

#else
      CALL MPI_ALLTOALL( LenInVector, 1, CPP_MPI_INTEGER,                     &
                         LenOutVector, 1, CPP_MPI_INTEGER,                    &
                         InComm, Ierr )
#endif
      CALL MPI_BARRIER( InComm, Ierr )


      CPP_LEAVE_PROCEDURE( "PAREXCHANGELENGTH" )

      RETURN
!EOC
      END SUBROUTINE ParExchangeLength
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
!BOP
! !IROUTINE:   ParCalcInfoDecompToGhost --- calculates info about the pattern
!
! !INTERFACE:

      subroutine ParCalcInfoDecompToGhost(InComm, DA,GB,Info)
!
! !USES:
      USE decompmodule, ONLY : DecompType,DecompInfo,DecompGlobalToLocal
      USE ghostmodule, ONLY : GhostType,GhostInfo
      IMPLICIT NONE

! !INPUT PARAMETERS:
      integer, intent(in)           :: InComm ! communicator
      type(DecompType), intent(in)  :: DA   ! Source Decomp Desc
      type(GhostType) , intent(in)  :: GB   ! Destination Ghost Desc

! !OUTPUT PARAMETERS:
      type (ParInfoType), intent(out) :: Info  ! Info structure
!
! !DESCRIPTION:
!     This routine calculates the information about a communication 
!     pattern that transforms from one decomposition to another, 
!     i.e., a so-called "transpose".  This is a copy of an algorithm 
!     from the ParPatternDecompToGhost subroutine.
!
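!     In outline (a paraphrase of the loop below): for every required
!     tag in the ghosted target GB, DecompGlobalToLocal finds the owning
!     PE and local index under DA; a new segment is counted whenever the
!     owner changes or the local index is not contiguous with the
!     previous one.  ParExchangeLength then makes the per-PE counts
!     known to the other side, and an MPI_ALLREDUCE with MPI_MAX yields
!     maxNumSeg, the largest segment count over all PEs.
!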
! !SYSTEM ROUTINES:
!    MPI_COMM_SIZE, MPI_COMM_RANK, MPI_ALLREDUCE
!
! !REVISION HISTORY:
!   07.09.04   Dennis     Creation based on algorithm in ParPatternDecompToGhost
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      integer :: nTags,oldpe,oldlocal,sTag,eTag,nCount
      integer :: j,pe,local,tag,ierr,iam,npes
      integer :: npesA,npesB,tmpA,tmp1B,tmp2B,tmp3B
      integer, allocatable :: sCount(:),rCount(:)
  
      CPP_ENTER_PROCEDURE( "PARCALCINFODECOMPTOGHOST" )

      call DecompInfo(DA,npesA,tmpA)
      call GhostInfo(GB,npesB,tmp1B,tmp2B,tmp3B)

      call MPI_COMM_SIZE(InComm,npes,ierr)
      call MPI_COMM_RANK(InComm,iam,ierr)

      allocate(sCount(npes),rCount(npes))
      sCount=0
      rCount=0
      if(iam .lt. npesB)  then 
! Parse through all the tags in the local segment
        nTags = SIZE(GB%Local%Head(iam+1)%StartTags)
        do j=1,nTags
          oldpe = -1
          oldlocal = 0 
          sTag = GB%Local%Head(iam+1)%StartTags(j)
          eTag = GB%Local%Head(iam+1)%EndTags(j)
          do tag = sTag,eTag
            if(tag > 0) then 
!
! Determine the index and PE of this entry on A. This might be inlined later
!
              call DecompGlobalToLocal(DA,tag,Local,Pe)
!
! This entry is owned by PE pe under DA; a new segment starts whenever
! the owning PE changes or the local index is not contiguous
!
              if( pe /= oldpe .or. local /= oldlocal+1) then 
                sCount(pe+1) = sCount(pe+1) + 1
              endif
              oldpe = pe  ! Update PE
              oldlocal = local  ! Update local index
            endif
          enddo
        enddo
      endif
      
! Calculate the length of receive segments
      call ParExchangeLength(InComm,sCount,rCount)
!  Record some information 
      Info%numSendSeg   = SUM(sCount)
      Info%numSendNeigh = COUNT(sCount > 0)

      Info%numRecvSeg   = SUM(rCount)
      Info%numRecvNeigh = COUNT(rCount > 0)
      nCount=MAX(Info%numSendSeg,Info%numRecvSeg)
      call MPI_ALLREDUCE(nCount,Info%maxNumSeg,1,INT4,MPI_MAX,InComm,ierr)

      deallocate(sCount,rCount)

      CPP_LEAVE_PROCEDURE( "PARCALCLENGTHDECOMPTOGHOST" )
      RETURN
!EOC
      end subroutine ParCalcInfoDecompToGhost
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
!BOP  
! !IROUTINE:   ParCalcInfoDecompToDecomp --- calculates info about the pattern
!
! !INTERFACE:

      subroutine ParCalcInfoDecompToDecomp(InComm, DA,DB,Info)
!     
! !USES:
      USE decompmodule, ONLY : DecompType,DecompInfo,DecompGlobalToLocal
      IMPLICIT NONE

! !INPUT PARAMETERS:
      integer, intent(in)           :: InComm  ! communicator
      type(DecompType), intent(in)  :: DA      ! Source Decomp Desc
      type(DecompType), intent(in)  :: DB      ! Destination Decomp Desc

! !OUTPUT PARAMETERS:
      type (ParInfoType), intent(out) :: Info  ! Info structure
!           
! !DESCRIPTION:
!     This routine calculates the information about a communication
!     pattern that transforms from one decomposition to another,
!     i.e., a so-called "transpose".  This is a copy of an algorithm
!     from the ParPatternDecompToDecomp subroutine.
!
! !SYSTEM ROUTINES:
!    MPI_COMM_SIZE, MPI_COMM_RANK, MPI_ALLREDUCE
!
! !REVISION HISTORY:
!   07.09.04   Dennis     Creation based on algorithm in ParPatternDecompToDecomp
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      integer :: nCount,npes,iam,ierr
      integer :: nTags,oldpe,oldlocal,sTag,eTag
      integer :: j,pe,local,tag,tmpA,tmpB,npesA,npesB
      integer, allocatable :: sCount(:),rCount(:)

      CPP_ENTER_PROCEDURE( "PARCALCINFODECOMPTODECOMP" )

      call DecompInfo(DA,npesA,tmpA)
      call DecompInfo(DB,npesB,tmpB)

      call MPI_COMM_SIZE(InComm,npes,ierr)
      call MPI_COMM_RANK(InComm,iam,ierr)
  
      allocate(sCount(npes),rCount(npes))
      sCount=0
      rCount=0
      if(iam .lt. npesB)  then
! Parse through all the tags in the local segment
        nTags = SIZE(DB%Head(iam+1)%StartTags)
        do j=1,nTags
          oldpe = -1
          sTag = DB%Head(iam+1)%StartTags(j)
          eTag = DB%Head(iam+1)%EndTags(j)
          do tag = sTag,eTag
!
! Determine the index and PE of this entry on A. This might be inlined later
!
            call DecompGlobalToLocal(DA,tag,Local,Pe)
!
! This entry is owned by PE pe under DA; a new segment starts whenever
! the owning PE changes
!
            if( pe /= oldpe ) then 
              oldpe = pe
              sCount(pe+1) = sCount(pe+1) + 1
            endif
          enddo
        enddo
      endif
! Calculate the length of receive segments
      call ParExchangeLength(InComm,sCount,rCount)
!  Record some information
      Info%numSendSeg   = SUM(sCount)
      Info%numSendNeigh = COUNT(sCount > 0)

      Info%numRecvSeg   = SUM(rCount)
      Info%numRecvNeigh = COUNT(rCount > 0)
      nCount=MAX(Info%numSendSeg,Info%numRecvSeg)
      call MPI_ALLREDUCE(nCount,Info%maxNumSeg,1,INT4,MPI_MAX,InComm,ierr)
  
      deallocate(sCount,rCount)

      CPP_LEAVE_PROCEDURE( "PARCALCINFODECOMPTODECOMP" )
      RETURN
!EOC
      end subroutine ParCalcInfoDecompToDecomp
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
!BOP
! !IROUTINE:   ParCalcInfoGhostToDecomp --- calculates info about the pattern
!
! !INTERFACE:

      subroutine ParCalcInfoGhostToDecomp(InComm, GA,DB,Info)
!
! !USES:
      USE decompmodule, ONLY : DecompType,DecompInfo,DecompGlobalToLocal
      USE ghostmodule, ONLY : GhostType,GhostInfo
      IMPLICIT NONE

! !INPUT PARAMETERS:
      integer, intent(in)           :: InComm  ! communicator
      type(GhostType), intent(in)   :: GA      ! Source Ghost Desc
      type(DecompType), intent(in)  :: DB      ! Destination Decomp Desc

! !OUTPUT PARAMETERS:
      type (ParInfoType), intent(out) :: Info  ! Info structure
!
! !DESCRIPTION:
!     This routine calculates the information about a communication
!     pattern that transforms from one decomposition to another,
!     i.e., a so-called "transpose".  This is a copy of an algorithm
!     from the ParPatternGhostToDecomp subroutine.
!
! !SYSTEM ROUTINES:
!    MPI_COMM_SIZE, MPI_COMM_RANK, MPI_ALLREDUCE
!
! !REVISION HISTORY:
!   07.09.04   Dennis     Creation based on algorithm in ParPatternGhostToDecomp
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      integer :: nTags,oldpe,oldlocal,sTag,eTag
      integer :: npes, nCount,iam,ierr
      integer :: j,pe,local,tag,npesA,npesB,tmpB,tmp1A,tmp2A,tmp3A
      integer, allocatable :: sCount(:),rCount(:)

      call GhostInfo(GA,npesA,tmp1A,tmp2A,tmp3A)
      call DecompInfo(DB,npesB,tmpB)

      call MPI_COMM_SIZE(InComm,npes,ierr)
      call MPI_COMM_RANK(InComm,iam,ierr)
  
      allocate(sCount(npes),rCount(npes))
      sCount=0
      rCount=0
      if(iam .lt. npesB) then 
! Parse through all the tags in the local segment
        nTags = SIZE(DB%Head(iam+1)%StartTags)
        do j=1,nTags
          oldpe = -1
          oldlocal = 0
          sTag = DB%Head(iam+1)%StartTags(j)
          eTag = DB%Head(iam+1)%EndTags(j)
          do tag = sTag,eTag
!
! Determine the index and PE of this entry on A. This might be inlined later
!
            call DecompGlobalToLocal(GA%Decomp,tag,Local,Pe)
!
! Each contiguous run of entries from the same PE in A forms one
! segment; count a new segment when the owning PE changes or the
! local index is not contiguous
!
            if( pe /= oldpe .or. local /= OldLocal+1 ) then 
              sCount(pe+1) = sCount(pe+1) + 1
            endif
            oldpe = pe
            oldlocal = local
          enddo
        enddo
      endif
! Calculate the length of receive segments
      call ParExchangeLength(InComm,sCount,rCount)
!  Record some information
      Info%numSendSeg   = SUM(sCount)
      Info%numSendNeigh = COUNT(sCount > 0)

      Info%numRecvSeg   = SUM(rCount)
      Info%numRecvNeigh = COUNT(rCount > 0)
      nCount=MAX(Info%numSendSeg,Info%numRecvSeg)
      call MPI_ALLREDUCE(nCount,Info%maxNumSeg,1,INT4,MPI_MAX,InComm,ierr)

      deallocate(sCount,rCount)

      CPP_LEAVE_PROCEDURE( "PARCALCLENGTHGHOSTTODECOMP" )
      RETURN
!EOC
      end subroutine ParCalcInfoGhostToDecomp
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
!BOP
! !IROUTINE:   ParCalcInfoGhostToGhost --- calculates info about the pattern
!
! !INTERFACE:

      subroutine ParCalcInfoGhostToGhost(InComm, GA,GB,Info)
!
! !USES:
      USE decompmodule, ONLY : DecompGlobalToLocal
      USE ghostmodule, ONLY  : GhostType,GhostInfo
      IMPLICIT NONE

! !INPUT PARAMETERS:
      integer, intent(in)           :: InComm ! communicator
      type(GhostType), intent(in)   :: GA     ! Source Ghost Desc
      type(GhostType), intent(in)   :: GB     ! Destination Ghost Desc

! !OUTPUT PARAMETERS:
      type (ParInfoType), intent(out) :: Info  ! Info structure
!
! !DESCRIPTION:
!     This routine calculates information about a communication
!     pattern that transforms from one ghosted decomposition to
!     another.  It uses a copy of the counting algorithm from the
!     ParPatternGhostToGhost subroutine.
!
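!     A corresponding call sketch (ghostA and ghostB are hypothetical
!     GhostType descriptors for the source and destination):
!
!        type (ParInfoType) :: info
!        call ParCalcInfoGhostToGhost( commglobal, ghostA, ghostB, info )
!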
! !SYSTEM ROUTINES:
!    MPI_COMM_SIZE, MPI_COMM_RANK, MPI_ALLREDUCE
!
! !REVISION HISTORY:
!   07.09.04   Dennis     Creation based on algorithm in ParPatternGhostToGhost
!
!EOP
!-----------------------------------------------------------------------
!BOC
! !LOCAL VARIABLES:
      integer :: nTags,oldpe,oldlocal,sTag,eTag,ierr,nCount
      integer :: j,pe,local,tag,npes,iam,npesA,npesB
      integer :: tmp1A,tmp2A,tmp3A,tmp1B,tmp2B,tmp3B
      integer, allocatable :: sCount(:),rCount(:)

      call GhostInfo(GA,npesA,tmp1A,tmp2A,tmp3A)
      call GhostInfo(GB,npesB,tmp1B,tmp2B,tmp3B)

      call MPI_COMM_SIZE(InComm,npes,ierr)
      call MPI_COMM_RANK(InComm,iam,ierr)
  
      allocate(sCount(npes),rCount(npes))
      sCount=0
      rCount=0 
      if(iam .lt. npesB) then 
! Parse through all the tags in the local segment
        nTags = SIZE(GB%Local%Head(iam+1)%StartTags)
        do j=1,nTags
          oldpe = -1
          oldlocal = 0
          sTag = GB%Local%Head(iam+1)%StartTags(j)
          eTag = GB%Local%Head(iam+1)%EndTags(j)
          do tag = sTag,eTag
            if ( tag > 0 ) then
!
! Determine the index and PE of this entry on A. This might be inlined later
!
              call DecompGlobalToLocal(GA%Decomp,tag,Local,Pe)
!
! Each contiguous run of entries from the same PE in A forms one
! segment; count a new segment when the owning PE changes or the
! local index is not contiguous
!
              if( pe /= oldpe .or. local /= OldLocal+1 ) then 
                sCount(pe+1)=sCount(pe+1)+1
              endif
              oldpe = pe
              oldlocal = local
            endif
          enddo
        enddo
      endif

! Calculate the length of receive segments
      call ParExchangeLength(InComm,sCount,rCount)
!  Record some information
      Info%numSendSeg   = SUM(sCount)
      Info%numSendNeigh = COUNT(sCount > 0)

      Info%numRecvSeg   = SUM(rCount)
      Info%numRecvNeigh = COUNT(rCount > 0)
      nCount=MAX(Info%numSendSeg,Info%numRecvSeg)
      call MPI_ALLREDUCE(nCount,Info%maxNumSeg,1,INT4,MPI_MAX,InComm,ierr)

      deallocate(sCount,rCount)

      CPP_LEAVE_PROCEDURE( "PARCALCINFOGHOSTTOGHOST" )
      RETURN
!EOC
      end subroutine ParCalcInfoGhostToGhost
# endif
#endif
      END MODULE parutilitiesmodule