#define MOD_ASSUMED_SIZE !#define MOD_SPECIFIED_SHAPE #if !defined( STAND_ALONE ) #define NOR4 ! Activate to effectively eliminate real*4 window #endif !BOP ! ! !MODULE: mod_comm --- SPMD parallel decompostion/communication module module mod_comm 44,2 ! ! !DESCRIPTION: ! ! \paragraph{Overview} ! ! This module contains SPMD parallelism decomposition and ! communication routines. This library was originally written by ! W. Putman and S.-J. Lin for simple gridded communications in the ! Finite-Volume General Circulation Model (FVGCM). Most of the ! member functions are specific to the type of gridded data, ! ghost communication and decompositions used in FVGCM (which ! are, however, very common in atmospheric models). ! ! The module was extended for irregular communication ! by W. Sawyer and A. Mirin. It is now ! a more general tool and has been incorporated into the Parallel ! Library for Grid Manipulations (PILGRIM) which is used in the ! Community Atmospheric Model (CAM) and the Physical-space ! Statistical Analysis System (PSAS). ! ! ********************************************************** ! The storage associated with the irregular communications ! is based on CAM requirements. It runs the risk of functioning ! improperly when used in another code. ! ********************************************************** ! ! Irregular communication is based on the {\tt blockdescriptor} ! derived type, which defines a set of parcels which are to be ! send to (or received from) another PE. The irregular ! communication routines operate on arrays of block descriptors ! whose length is equal to number of PEs involved in the ! communication. This means the irregular communication primitives ! are merely non-blocking (potentially) all-to-all primitives. ! ! This package is based on standard MPI-1 communications, and OpenMP ! may be implemented. MPI-2 and SHMEM support have been removed. ! ! ! \paragraph{Use of Global Arrays} ! ! The module uses the concept of global arrays (coined from former ! usage of shared memory arenas in the "multi-level parallelism" ! (MLP) paradigm). Global arrays are merely buffers into which ! data are packed for the transfer to other PEs and are not ! necessarily of global extent. All such arrays are ! 1-dimensional; they are accessed as needed with offset vars. ! ! \paragraph{Use of Windows} ! ! All implementations use real*8, real*4, and integer*4 windows ! which are used with global arrays as follows: ! ! \begin{itemize} ! \item r8\_win -> ga\_r8 - for use with real*8 types ! \item r4\_win -> ga\_r4 - for use with real*4 types ! \item i4\_win -> ga\_i4 - for use with integer*4 types ! \end{itemize} ! ! note: MPI routines need 2 buffers per GA, ga\_<type>\_s & ga\_<type>\_r ! ga\_<type>\_r is used for the windows ! ! \paragraph{Compilation} ! ! This module contains several precompile options: ! ! \begin{itemize} ! \item {\tt STAND_ALONE}: Use as stand-alone library (if ! defined) or as part of CAM (if ! undefined) ! \item {\tt MODCM_TIMING}: Turn on CAM timing routines (only ! available if compiled in CAM framework) ! \item {\tt _OPENMP}: Implicit token (controlled by ! compiler) to enable OpenMP ! \end{itemize} ! ! ! \paragraph{Usage} ! ! NOTE - must call PILGRIM routine parinit to initialize before ! making any other calls. ! ! The public members of this module are: ! ! \begin{itemize} ! \item {\tt mp\_init}: Initialize module ! \item {\tt mp\_exit}: Exit module ! \item {\tt mp\_send4d\_ns}: Ghost 4D array on north/south ! \item {\tt mp\_recv4d\_ns}: Complete 4D N/S ghost operation ! \item {\tt mp\_send2\_ns}: Ghost 2 3D arrays on north/south ! \item {\tt mp\_recv2\_ns}: Complete 2x3D N/S ghost operation ! \item {\tt mp\_send3d}: Send 3D general ghost region ! \item {\tt mp\_recv3d}: Complete 3D general ghost operation ! \item {\tt mp\_send3d\_2}: Send 2x3D general ghost regions ! \item {\tt mp\_recv3d\_2}: Complete 2x3D general ghost operation ! \item {\tt get\_partneroffset}:Offset for remote write ! \item {\tt mp\_sendirr}: Initiate all-to-all send of parcels ! \item {\tt mp\_recvirr}: Complete all-to-all chunk commun. ! \end{itemize} ! ! There are variants of some of these routines for r4 and i4 data types. ! There are other public routines, but these are only used internally ! in PILGRIM, and they should not be called by user applications. ! ! !REVISION HISTORY: ! 2001.09.01 Lin ! 2002.04.16 Putman Modified for Global Array code ! 2002.04.16 Putman Added ProTeX documentation ! 2002.05.28 Putman Added use of precision module ! 2003.06.24 Sawyer Minor additions for use with mod_irreg ! 2004.01.08 Sawyer Removed older functionality, no longer needed ! 2004.02.10 Mirin Major restructuring and simplification. Documentation ! 2004.03.06 Sawyer Additional documentation; cosmetics ! 2005.03.20 Sawyer Added extensive support for real*4 ! 2005.10.12 Worley Improved vectorization of buffer copies and general clean-up ! 2006.05.15 Mirin Make dynamic allocation the default; general clean-up. ! !USES: #if defined( STAND_ALONE ) # define iulog 6 #else use cam_logfile, only: iulog #endif ! ! Mod_comm has option for stand-alone use as well as within CAM ! #if defined ( SPMD ) #if defined( STAND_ALONE ) # define r8 selected_real_kind(12) # define r4 selected_real_kind( 6) # define i8 selected_int_kind(13) # define i4 selected_int_kind( 6) # define PLON 144 # define PLAT 91 # define PLEV 26 # define PCNST 1 #else use shr_kind_mod, only : r8 => shr_kind_r8, r4 => shr_kind_r4, & i8 => shr_kind_i8, i4 => shr_kind_i4 #endif #if defined( MODCM_TIMING ) use perf_mod #endif implicit none #include "mpif.h" ! !PUBLIC MEMBER FUNCTIONS: public mp_init, mp_exit, & mp_send4d_ns, mp_recv4d_ns, mp_send4d_ns_r4, mp_recv4d_ns_r4, & mp_send2_ns, mp_recv2_ns, mp_send3d_2, mp_recv3d_2, & mp_send3d, mp_recv3d, mp_sendirr, mp_recvirr, & mp_sendirr_r4, mp_recvirr_r4, mp_sendirr_i4, mp_recvirr_i4, & mp_swapirr, mp_swapirr_i4, mp_barrier, & get_partneroffset, mp_r8, mp_r4, mp_i4, & mp_sendtrirr, mp_recvtrirr, mp_swaptrirr public modcam_method, modcam_geopk, modcam_gatscat, modcam_npryz, modcam_maxirr ! !PRIVATE MEMBER FUNCTIONS: private ceil2 ! copy of routine in atm/cam/src/utils/spmdutils private pair ! copy of routine in atm/cam/src/utils/spmdutils !------------------------------------------------------------------------------ ! type declaration for describing an arbitrary number of contiguous parcels ! this is for irregular communications !------------------------------------------------------------------------------ type blockdescriptor integer :: method ! transpose method integer :: type ! Ptr to MPI derived type integer, pointer :: displacements(:) ! Offsets in local segment integer, pointer :: blocksizes(:) ! Block sizes to transfer integer :: partneroffset ! Aggregated partner offset integer :: partnertype ! Ptr to partner's MPI derived type integer :: Nparcels ! size( displacements ) integer :: Tot_Size ! sum ( blocksizes ) end type blockdescriptor ! Transpose methods (method) ! 0 for contiguous temporary buffer ! 1 for direct communication (derived types) ! The variables immediately below refer specifically to mpi derived types INTEGER, ALLOCATABLE, SAVE :: InHandle(:, :) INTEGER, ALLOCATABLE, SAVE :: OutHandle(:, :) INTEGER, SAVE :: BegTrf = 0 ! Ongoing overlapped begintransfer # INTEGER, SAVE :: EndTrf = 0 ! Ongoing overlapped endtransfer # INTEGER, SAVE :: MaxTrf = 0 ! Max no. active Mp_sendirr derived type messages ! !PUBLIC DATA MEMBERS: integer, SAVE:: gid ! PE id integer(i4), SAVE:: masterpro = 0 ! Master process id integer(i4), SAVE:: numpro ! Permanent No. of PEs integer(i4), SAVE:: numcomm ! Local No. of PEs integer(i4), SAVE:: numcpu ! No. of threads integer, SAVE:: commglobal ! Global Communicator integer, SAVE:: Max_Nparcels = 0 ! Maximum number of parcels in ! single blockdescriptor !------------------------------------------------------------------------------ ! Local parameters !------------------------------------------------------------------------------ integer, parameter:: nbuf = 2 ! Max No. of sends per call ! mp_send4d_ns has two sends per call (full border regions to north and south) ! mp_send2_ns has four sends per call (2 directions and 2 variables); however, ! only one ghost latitude is sent, so nbuf=2 suffices as long as nghost ! is greater than 1. ! mp_send3d has one send per call (border region in one direction). ! mp_send3d_2 has two sends per call (2 variables, border region in one direction). integer, parameter:: nghost = 3 ! No. of ghost indices integer, parameter:: max_nq = 1 ! No. of tracers simultaneously ! border communicated; can be ! overridden with dynamic storage integer, parameter:: max_trac = PCNST ! No. of tracers integer, parameter:: max_call = 2 ! Max No. of back-to-back... ! ...mp_send calls ! Currently, CAM has at most two overlapping border communication calls ! The above variable is relevant for contiguous irregular communications integer, parameter:: idimsize = PLON*nghost*(PLEV+1)*max_nq ! Size of MPI buffer region ! in mp_send/mp_recv calls, used ! to determine offset in GA integer, parameter:: platg = PLAT + 2*nghost integer, parameter :: mp_r4 = MPI_REAL integer, parameter :: mp_r8 = MPI_DOUBLE_PRECISION integer, parameter :: mp_i4 = MPI_INTEGER !------------------------------------------------------------------------------ ! Local variables !------------------------------------------------------------------------------ integer, SAVE:: max_irr = 0 ! Max No. active Mp_sendirr calls with window integer ierror integer, SAVE:: sizet1, sizer8, sizer4, sizei4 ! CAM-specific variables integer, SAVE:: tracmax, tracbmax, dpvarmax, totvar integer, SAVE:: phys_transpose_mod integer, SAVE:: idimsizz integer, SAVE:: modcam_method, modcam_geopk, modcam_gatscat integer, SAVE:: modcam_npryz(4), modcam_tagoffset, modcam_maxirr integer, parameter :: phys_transpose_modmin = 11 integer, parameter :: phys_transpose_vars = 7 data phys_transpose_mod / -1 / data modcam_method / -1 / data modcam_geopk / -1 / data modcam_gatscat / -1 / data modcam_npryz / -1, -1, -1, -1 / data modcam_tagoffset / 0 / data modcam_maxirr / -1 / ! ! tracmax is the maximum number of tracers simultaneously transposed within dynamics (set to 1) ! (except in dynamics-physics transposes) ! tracbmax is the maximum number of tracers simultaneously border communicated ! dpvarmax is the number of variables communicated in dynamics-physics transposes ! totvar is the maximum number of variables simultaneously transposed ! phys_transpose_mod is the communication method for dynamics/physics transposes; admissable values ! are >= phys_transpose_modmin; it is communicated from CAM when such transposes ! are requested. ! phys_transpose_vars is the number of non-tracer variables transposed between dynamics and ! physics instantiations in CAM. ! modcam_method, modcam_geopk and modcam_gatscat correspond to mod_method, mod_geopk and ! mod_gatscat in CAM. ! modcam_npryz corresponds to npr_yz in CAM. ! modcam_maxirr corresonds to mod_maxirr in CAM. !------------------------------------------------------------------------------ ! Variables to control global array locations and window synchronization !------------------------------------------------------------------------------ integer win_count ! Counts No. of windows in use integer igosouth, igonorth ! Index of latitudinal send direction integer ifromsouth, ifromnorth ! Index of latitudinal recv direction !------------------------------------------------------------------------------ ! Local type declaration for mp_windows !------------------------------------------------------------------------------ type window integer :: id ! Window id integer :: size ! Size of global window (point based) integer :: ncall_s ! Count send calls on window integer :: ncall_r ! Count recv calls on window integer :: offset_s ! Starting position in GA send integer :: offset_r ! Starting position in GA recv integer :: dest ! For use with send calls integer :: src ! For use with recv calls integer :: size_r ! Size of incoming message integer :: nsend ! Send counter integer :: nrecv ! Receive post counter integer :: nread ! Receive confirm counter integer, pointer :: sqest(:) ! Send handle integer, pointer :: rqest(:) ! Receive handle end type window !------------------------------------------------------------------------------ ! Beginning Global Array variable declaration: !------------------------------------------------------------------------------ type (window) :: r8_win type (window) :: r4_win type (window) :: i4_win type (window) :: t1_win ! Upper bound on ratio of local to average storage over subdomains. ! This takes into account different sized subdomains. real*8, parameter :: alloc_slack_factor = 1.2d0 ! ! window variable declarations ! real(r8), allocatable, SAVE:: ga_t1_r(:) real(r8), allocatable, SAVE:: ga_t1_s(:) real(r8), allocatable, SAVE:: ga_r8_r(:) real(r8), allocatable, SAVE:: ga_r8_s(:) real(r4), allocatable, SAVE:: ga_r4_r(:) real(r4), allocatable, SAVE:: ga_r4_s(:) integer(i4), allocatable, SAVE:: ga_i4_r(:) integer(i4), allocatable, SAVE:: ga_i4_s(:) ! ! auxiliary variable declarations ! integer, SAVE:: Status(MPI_STATUS_SIZE) integer, allocatable, SAVE:: Stats(:) ! !EOP !------------------------------------------------------------------------------ contains !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_init --- Initialize SPMD parallel communication ! ! !INTERFACE: subroutine mp_init( comm, npryzxy, mod_method, mod_geopk, mod_gatscat, mod_maxirr ) 1,7 ! ! !INPUT PARAMETERS: integer, optional :: comm ! communicator integer, optional, intent(in) :: npryzxy(4) ! 2D decomposition integer, optional, intent(in) :: mod_method ! CAM optimization integer, optional, intent(in) :: mod_geopk ! CAM optimization integer, optional, intent(in) :: mod_gatscat ! CAM optimization integer, optional, intent(in) :: mod_maxirr ! CAM optimization ! !DESCRIPTION: ! ! Initialize SPMD parallel communication. It is recommended that ! COMM (main communicator) and NPRYZXY (2D decomposition) be set. ! ! Set the mod* variables only if you are acquainted with their ! meaning (default is 0). ! ! !REVISION HISTORY: ! 2001.09.01 Lin ! 2002.02.15 Putman Modified for Global Array code ! 2002.04.09 Putman Added ProTeX documentation ! 2002.08.06 Sawyer Added optional communicator input argument ! 2006.06.15 Sawyer Added CAM-dependent optional arguments ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer mysize integer using_window, vertical_lines, latitude_lines integer local_dynamic_storage, numpro_use real*8 geopkrat, one, ghostrat ! Initialize MPI; allow for general communicator if ( present(comm) ) then call mpi_start( comm ) else call mpi_start( MPI_COMM_WORLD ) endif ! Initialize OpenMP call omp_start ! ! Adopt 2D decomposition if provided. ! modcam_npryz = (/ 1,1,1,1 /) ! Default value (sequential) if ( present( npryzxy ) ) then modcam_npryz(1:4) = npryzxy(1:4) modcam_tagoffset = modcam_npryz(3) * modcam_npryz(4) endif if (gid .eq. 0) then write (iulog,*) 'MOD_COMM - modcam_npryz = ', & modcam_npryz(1), modcam_npryz(2), & modcam_npryz(3), modcam_npryz(4) write (iulog,*) 'MOD_COMM - modcam_tagoffset = ', modcam_tagoffset endif ! ! Set CAM optimization variables ! ! modcam_method refers to irregular communications for transposes ! modcam_geopk refers to irregular communications for the geopotential ! modcam_gatscat refers to irregular communications for gather/scatters ! For any of these, a value of 0 means source data will be gathered into a contiguous ! buffer (window), communicated to a contiguous buffer (window) in the target, and ! then scattered to its final destination; a value of 1 means MPI derived types will ! be used (hence not requiring window storage). ! modcam_maxirr refers to maximum number of irregular communications to be active at once modcam_method = 0 ! Default value modcam_geopk = 0 ! Default value modcam_gatscat = 0 ! Default value modcam_maxirr = 1 ! Default value if ( present( mod_method ) ) modcam_method = mod_method if ( present( mod_geopk ) ) modcam_geopk = mod_geopk if ( present( mod_gatscat ) ) modcam_gatscat = mod_gatscat if ( present( mod_maxirr ) ) modcam_maxirr = mod_maxirr if (gid .eq. 0) then write(iulog,*) 'MOD_COMM - modcam_method modcam_geopk modcam_gatscat modcam_maxirr = ', & modcam_method, modcam_geopk, modcam_gatscat, modcam_maxirr endif ! ! End CAM optimizations ! MaxTrf = modcam_maxirr max_irr = modcam_maxirr win_count = 0 ! !************************************************************************* ! local_dynamic_storage is set to 1 when window storage is based on locally dimensioned ! arrays, 0 otherwise; this occurs when modcam_gatscat equals 1, as it is only the ! gather/scatters that require global storage. !************************************************************************* ! local_dynamic_storage = 0 if (modcam_gatscat .eq. 1) local_dynamic_storage = 1 allocate( Stats(MAX(nbuf,numpro)*MAX(max_call,max_irr)*MPI_STATUS_SIZE) ) allocate( InHandle(numpro,MaxTrf) ) allocate( OutHandle(numpro,MaxTrf) ) idimsizz = idimsize if (local_dynamic_storage .eq. 1) then if (gid .eq. 0) write(iulog,*) 'Using local dynamic storage for mod_comm window' else if (gid .eq. 0) write(iulog,*) 'Using global dynamic storage for mod_comm window' endif ! ! Dynamically allocate target global arrays ! !************************************************************************* ! Compute additional storage due to ghost latitudes being included in some ! transposes. Allow 3 ghost points on each side. The required storage ! could be (6+L)/L times the original storage, where L is the number of ! latitude lines in the subdomain. Ghost points can also occur in the ! vertical due to edge quantities, but this would not occur simultaneously ! with ghost points in latitude; the extra storage due to vertical ghost ! points is not nearly as great as with latitude. !************************************************************************* using_window = 1 ! This is a local variable if (modcam_method .eq. 1) using_window = 0 one = real(1,r8) ghostrat = one if (using_window .eq. 1 .and. local_dynamic_storage .eq. 1) then latitude_lines = real(PLAT,r8)/real(modcam_npryz(1),r8) ghostrat = real(6+latitude_lines,r8)/real(latitude_lines,r8) endif if (gid .eq. 0) write(iulog,*) 'Mod_comm - ghostrat = ', ghostrat !************************************************************************* ! Compute extent to which required window storage for geopotential computation ! exceeds that of transpose - relevant only for local dynamic storage, ! since with global storage there will be enough space anyway; also, ! this applies only when using window; further, this applies only when ! the CAM variable geopktrans equals 1, though we do not test for that here. ! The geopotential calculation sends a latitude line to every other process ! either vertically above or below the given process; there can be ! at most modcam_npryz(2)-1 such target processes; compared to transposes ! (which send all vertical lines), the amount of data sent is expressed ! as the ratio geopkrat; our concern is making the window (whose size ! is computed based on transposes) large enough, so we must multiply its ! size by geopkrat; we never shrink the window, so geopkrat >= 1. !************************************************************************* using_window = 1 ! This is a local variable if (modcam_geopk .eq. 1) using_window = 0 one = real(1,r8) geopkrat = one if (using_window .eq. 1 .and. local_dynamic_storage .eq. 1) then vertical_lines = ceiling(real(PLEV,r8)/real(modcam_npryz(2),r8)) geopkrat = real(modcam_npryz(2)-1,r8)/real(vertical_lines,r8) geopkrat = max(geopkrat,one) endif if (gid .eq. 0) write(iulog,*) 'Mod_comm - geopkrat = ', geopkrat !************************************************************************* ! beginning of CAM totvar computation !************************************************************************* ! CAM contains two kinds of transposes. The most commonly referred to transposes ! refer to those which connect the xy and yz decompositions. Depending on ! the physics decomposition, CAM might additionally compute transposes between ! the dynamics and physics; this depends on the variable phys_loadbalance. ! Furthermore, these transposes might or might not be computed using mod_comm. ! The former transposes are generally performed one variable at a time; the ! latter transposes combine all variables to be transposed, including the ! full complement of tracers. The maximum number of variables to be ! simultaneously subject to irregular communications is dependent on ! whether or not mod_comm is used to compute dynamics-physics transposes ! and could depend on the number of tracers. ! Compute maximum number of variables to be simultaneously subject ! to irregular communications (e.g., transposed variables based on CAM) ! and store in the variable 'totvar'. ! Tracmax is the number of tracers simultaneously transposed within dynamics; ! Tracbmax is the number of tracers simultaneously border comunicated within trac2d; ! both of these are currently hardwired to 1. tracmax = 1 tracbmax = 1 totvar = tracmax ! Now consider dynamics-physics transposes in CAM dp_coupling (dpvarmax) ! If phys_transpose_mod is still -1, that means it has not been updated ! by CAM and hence mod_comm will not be used for dynamics-physics transposes. ! (NOTE: phys_transpose_mod is computed in phys_grid_setopts in phys_grid.F90.) ! Also note that the logic involving phys_transpose_mod and phys_transpose_modmin ! must remain consistent with the coding in phys_grid.F90. Additionally, ! phys_transpose_vars must remain consistent with the coding in dp_coupling.F90. ! (See above declaration and initialization for CAM-specific variables.) ! (begin dpvarmax calculation) if (phys_transpose_mod .eq. -1) then if (gid .eq. 0) write(iulog,*) & '(MOD_COMM) - mod_comm not being used for dynamcis-physics transposes' dpvarmax = 0 ! ! If phys_transpose_mod is >= phys_transpose_modmin, that is a signal that mod_comm is to be used ! for dynamics/physics transposes in CAM. In that case, one must allocate enough window ! storage for those transposes. Presently, the number of such simultaneously transposed ! variables equals phys_transpose_vars plus the number of constituents. ! elseif (phys_transpose_mod .ge. phys_transpose_modmin) then dpvarmax = phys_transpose_vars + max_trac else dpvarmax = 0 endif ! (end dpvarmax calculation) ! totvar is the maximum of (1) the number of tracers to be simultaneously transposed ! within the dynamics, and (2) the number of variables to be transposed between ! dynamics and physics instantiations in CAM totvar = max(totvar, dpvarmax) !************************************************************************* ! end of CAM totvar computation !************************************************************************* if (gid .eq. 0) write(iulog,*) 'Mod_comm - tracmax dpvarmax totvar tracbmax = ', & tracmax, dpvarmax, totvar, tracbmax idimsizz = (idimsize/max_nq)*tracbmax sizet1 = idimsizz*nbuf*max_call ! Adjust window sizes for geopotential and/or ghost points sizer8 = PLON*platg*(PLEV+1)*totvar*max(geopkrat,ghostrat)*max_irr sizer4 = PLON*platg*(PLEV+1)*totvar*max(geopkrat,ghostrat)*max_irr sizei4 = PLON*PLAT*PLEV*max_irr ! Compute local storage requirement for irregular communications by dividing ! global requirement by the number of tasks. Allow slack factor to account ! for nonuniformity of decomposition and ghost zones. Not valid for global ! operations such as gathers and scatters when local windows are used. if (local_dynamic_storage .eq. 1) then numpro_use = modcam_npryz(1) * modcam_npryz(2) sizer8 = ceiling( alloc_slack_factor*real(sizer8,r8)/real(numpro_use,r8) ) sizer4 = ceiling( alloc_slack_factor*real(sizer4,r8)/real(numpro_use,r8) ) ! The only i4 irregular communications in CAM occur in io_dist. sizei4 = 1 endif # if defined ( NOR4 ) sizer4 = 1 if (gid .eq. 0) write(iulog,*) 'Mod_comm - r4 windows disabled' # endif using_window = 1 ! This is a local variable if (modcam_method .eq. 1 .and. modcam_geopk .eq. 1) using_window = 0 if (using_window .eq. 0) then if (gid .eq. 0) write(iulog,*) 'Mod_comm - r8 and r4 windows set to trivial size' sizer8 = 1 sizer4 = 1 endif ! Allocate global storage allocate( ga_t1_r(sizet1) ) allocate( ga_t1_s(sizet1) ) allocate( ga_r8_r(sizer8) ) allocate( ga_r8_s(sizer8) ) allocate( ga_r4_r(sizer4) ) allocate( ga_r4_s(sizer4) ) allocate( ga_i4_r(sizei4) ) allocate( ga_i4_s(sizei4) ) ! Initialize windows mysize = sizet1 call win_init_r8(comm, t1_win, ga_t1_r, mysize) if (gid .eq. 0) write(iulog,*) 'Mod_comm t1_win window size = ', mysize mysize = sizer8 call win_init_r8(comm, r8_win, ga_r8_r, mysize) if (gid .eq. 0) write(iulog,*) 'Mod_comm r8_win window size = ', mysize mysize = sizer4 call win_init_r4(comm, r4_win, ga_r4_r, mysize) if (gid .eq. 0) write(iulog,*) 'Mod_comm r4_win window size = ', mysize mysize = sizei4 call win_init_i4(comm, i4_win, ga_i4_r, mysize) if (gid .eq. 0) write(iulog,*) 'Mod_comm i4_win window size = ', mysize igosouth = 0 igonorth = 1 ifromsouth = 1 ifromnorth = 0 !EOC end subroutine mp_init !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_exit --- End SPMD parallel communication ! ! !INTERFACE: subroutine mp_exit( comm ) 1 ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator ! !DESCRIPTION: ! ! End SPMD parallel communication ! ! !REVISION HISTORY: ! 2001.09.01 Lin ! 2002.02.15 Putman Modified for Global Array code ! 2002.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC call MPI_FINALIZE (ierror) return !EOC end subroutine mp_exit !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: omp_start --- Start openMP parallelism ! ! !INTERFACE: subroutine omp_start 1 ! !DESCRIPTION: ! ! Start openMP parallelism ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer ios, n, nowpro, nowcpu ! Compute number of OpenMP threads #if defined(_OPENMP) integer omp_get_num_threads !$omp parallel numcpu = omp_get_num_threads() !$omp end parallel #else numcpu = 1 #endif !EOC end subroutine omp_start !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mpi_start --- Start MPI parallelism ! ! !INTERFACE: subroutine mpi_start( comm ) 2 ! !INPUT PARAMETERS: integer :: comm ! communicator ! !DESCRIPTION: ! ! Start MPI parallelism ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! 02.08.06 Sawyer Added communicator input arguments ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: logical flag integer npthreads call MPI_INITIALIZED( flag, ierror ) if ( .not. flag ) then call MPI_INIT( ierror ) comm = MPI_COMM_WORLD endif call MPI_COMM_RANK (comm, gid, ierror) call MPI_COMM_SIZE (comm, numpro, ierror) call MPI_COMM_DUP (comm, commglobal, ierror) !EOC end subroutine mpi_start !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: win_init_r8 --- Initialize real*8 communication window ! ! !INTERFACE: subroutine win_init_r8(comm, win, ga, isize) 2 ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator integer, intent(in) :: isize real(r8), intent(in) :: ga(isize) ! !OUTPUT PARAMETERS: type (window), intent(inout) :: win ! !DESCRIPTION: ! ! Initialize real*8 communication window ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: win_count = win_count + 1 win%id = win_count win%size = isize win%ncall_s = 0 win%ncall_r = 0 win%nsend = 0 win%nrecv = 0 win%nread = 0 allocate( win%sqest(MAX(nbuf,numpro)*MAX(max_call,max_irr)) ) allocate( win%rqest(MAX(nbuf,numpro)*MAX(max_call,max_irr)) ) !EOC end subroutine win_init_r8 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: win_init_r4 --- Initialize real*4 communication window ! ! !INTERFACE: subroutine win_init_r4(comm, win, ga, isize) 1 ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator integer, intent(in) :: isize real(r4), intent(in) :: ga(isize) ! !OUTPUT PARAMETERS: type (window), intent(inout) :: win ! !DESCRIPTION: ! ! Initialize real*4 communication window ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: win_count = win_count + 1 win%id = win_count win%size = isize win%ncall_s = 0 win%ncall_r = 0 win%nsend = 0 win%nrecv = 0 win%nread = 0 allocate( win%sqest(MAX(nbuf,numpro)*MAX(max_call,max_irr)) ) allocate( win%rqest(MAX(nbuf,numpro)*MAX(max_call,max_irr)) ) !EOC end subroutine win_init_r4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: win_init_i4 --- Initialize integer*4 communication window ! ! !INTERFACE: subroutine win_init_i4(comm, win, ga, isize) 1 ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator integer, intent(in) :: isize integer(i4), intent(in) :: ga(isize) ! !OUTPUT PARAMETERS: type (window), intent(inout) :: win ! !DESCRIPTION: ! ! Initialize integer*4 communication window ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: win_count = win_count + 1 win%id = win_count win%size = isize win%ncall_s = 0 win%ncall_r = 0 win%nsend = 0 win%nrecv = 0 win%nread = 0 allocate( win%sqest(MAX(nbuf,numpro)*MAX(max_call,max_irr)) ) allocate( win%rqest(MAX(nbuf,numpro)*MAX(max_call,max_irr)) ) !EOC end subroutine win_init_i4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_send4d_ns --- Send 4d north/south ghost latitudes (real*8) ! ! !INTERFACE: subroutine mp_send4d_ns(comm, im, jm, km, nq, jfirst, jlast, kfirst, klast, & 14,5 ng_s, ng_n, q) ! ! !INPUT PARAMETERS: integer, intent(in):: comm ! communicator integer, intent(in):: im, jm, km, nq integer, intent(in):: jfirst, jlast integer, intent(in):: kfirst, klast integer, intent(in):: ng_s ! southern zones to ghost integer, intent(in):: ng_n ! northern zones to ghost real(r8), intent(in):: q(im,jfirst-ng_s:jlast+ng_n,kfirst:klast,nq) ! ! !DESCRIPTION: ! ! Send 4d north/south ghost latitudes ! ! !REVISION HISTORY: ! 2001.09.01 Lin ! 2002.02.15 Putman Modified for Global Arrays code ! 2002.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer :: gidu #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_RANK (comm, gidu, ierror) call Win_Open(comm, t1_win) ! Send to south if ( jfirst > 1 ) then t1_win%src = gidu - 1 t1_win%offset_r = ifromsouth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf t1_win%size_r = im*ng_s*(klast-kfirst+1)*nq call Ga_RecvInit_r8(comm, t1_win, ga_t1_r) t1_win%dest = gidu - 1 t1_win%offset_s = igosouth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf call Ga_Put4d_r8(comm, q, t1_win, im, jm, km, nq, & 1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, & 1, im, jfirst, jfirst+ng_n-1, kfirst, klast, 1, nq, & ga_t1_s, ga_t1_r ) endif ! Send to north if ( jlast < jm ) then t1_win%src = gidu + 1 t1_win%offset_r = ifromnorth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf t1_win%size_r = im*ng_n*(klast-kfirst+1)*nq call Ga_RecvInit_r8(comm, t1_win, ga_t1_r) t1_win%dest = gidu + 1 t1_win%offset_s = igonorth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf call Ga_Put4d_r8(comm, q, t1_win, im, jm, km, nq, & 1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, & 1, im, jlast-ng_s+1, jlast, kfirst, klast, 1, nq, & ga_t1_s, ga_t1_r ) endif #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_send4d_ns !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_recv4d_ns --- Receive 4d north/south ghost latitudes (real*8) ! ! !INTERFACE: subroutine mp_recv4d_ns(comm, im, jm, km, nq, jfirst, jlast, kfirst, klast, & 13,4 ng_s, ng_n, q) ! ! !INPUT PARAMETERS: integer, intent(in):: comm ! communicator integer, intent(in):: im, jm, km, nq integer, intent(in):: jfirst, jlast integer, intent(in):: kfirst, klast integer, intent(in):: ng_s ! southern zones to ghost integer, intent(in):: ng_n ! northern zones to ghost ! !OUTPUT PARAMETERS: real(r8), intent(inout):: q(im,jfirst-ng_s:jlast+ng_n,kfirst:klast,nq) ! ! !DESCRIPTION: ! ! Receive 4d north/south ghost latitudes ! ! !REVISION HISTORY: ! 2001.09.01 Lin ! 2002.02.15 Putman Modified for Global Arrays code ! 2002.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer :: gidu #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_RANK (comm, gidu, ierror) call Win_Close(comm, t1_win) ! Recv from south if ( jfirst > 1 ) then t1_win%src = gidu-1 t1_win%offset_r = ifromsouth*idimsizz + (t1_win%ncall_r-1)*idimsizz*nbuf call Ga_Get4d_r8(comm, q, t1_win, im, jm, km, nq, & 1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, & 1, im, jfirst-ng_s, jfirst-1, kfirst, klast, 1, nq, & ga_t1_r ) endif ! Recv from north if ( jlast < jm ) then t1_win%src = gidu+1 t1_win%offset_r = ifromnorth*idimsizz + (t1_win%ncall_r-1)*idimsizz*nbuf call Ga_Get4d_r8(comm, q, t1_win, im, jm, km, nq, & 1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, & 1, im, jlast+1, jlast+ng_n, kfirst, klast, 1, nq, & ga_t1_r ) endif call Win_Finalize(comm, t1_win) #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_recv4d_ns !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_send4d_ns_r4 --- Send 4d north/south ghost latitudes (real*4) ! ! !INTERFACE: subroutine mp_send4d_ns_r4(comm, im, jm, km, nq, jfirst, jlast, kfirst, klast, &,5 ng_s, ng_n, q) ! ! !INPUT PARAMETERS: integer, intent(in):: comm ! communicator integer, intent(in):: im, jm, km, nq integer, intent(in):: jfirst, jlast integer, intent(in):: kfirst, klast integer, intent(in):: ng_s ! southern zones to ghost integer, intent(in):: ng_n ! northern zones to ghost real(r4), intent(in):: q(im,jfirst-ng_s:jlast+ng_n,kfirst:klast,nq) ! ! !DESCRIPTION: ! ! Send 4d north/south ghost latitudes ! ! !REVISION HISTORY: ! 2005.03.20 Sawyer Creation from mp_send4d_ns ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer :: gidu #if defined ( NOR4 ) write(iulog,*) 'Mod_comm: mp_send4d_ns_r4 - r4 windows disabled - exiting' call exit(1) #endif #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_RANK (comm, gidu, ierror) call Win_Open(comm, r4_win) ! Send to south if ( jfirst > 1 ) then r4_win%src = gidu - 1 r4_win%offset_r = ifromsouth*idimsizz + (r4_win%ncall_s-1)*idimsizz*nbuf r4_win%size_r = im*ng_s*(klast-kfirst+1)*nq call Ga_RecvInit_r4(comm, r4_win, ga_r4_r) r4_win%dest = gidu - 1 r4_win%offset_s = igosouth*idimsizz + (r4_win%ncall_s-1)*idimsizz*nbuf call Ga_Put4d_r4(comm, q, r4_win, im, jm, km, nq, & 1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, & 1, im, jfirst, jfirst+ng_n-1, kfirst, klast, 1, nq, & ga_r4_s, ga_r4_r ) endif ! Send to north if ( jlast < jm ) then r4_win%src = gidu + 1 r4_win%offset_r = ifromnorth*idimsizz + (r4_win%ncall_s-1)*idimsizz*nbuf r4_win%size_r = im*ng_n*(klast-kfirst+1)*nq call Ga_RecvInit_r4(comm, r4_win, ga_r4_r) r4_win%dest = gidu + 1 r4_win%offset_s = igonorth*idimsizz + (r4_win%ncall_s-1)*idimsizz*nbuf call Ga_Put4d_r4(comm, q, r4_win, im, jm, km, nq, & 1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, & 1, im, jlast-ng_s+1, jlast, kfirst, klast, 1, nq, & ga_r4_s, ga_r4_r ) endif #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_send4d_ns_r4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_recv4d_ns_r4 --- Receive 4d north/south ghost latitudes (real*4) ! ! !INTERFACE: subroutine mp_recv4d_ns_r4(comm, im, jm, km, nq, jfirst, jlast, kfirst, klast, &,4 ng_s, ng_n, q) ! ! !INPUT PARAMETERS: integer, intent(in):: comm ! communicator integer, intent(in):: im, jm, km, nq integer, intent(in):: jfirst, jlast integer, intent(in):: kfirst, klast integer, intent(in):: ng_s ! southern zones to ghost integer, intent(in):: ng_n ! northern zones to ghost ! !OUTPUT PARAMETERS: real(r4), intent(inout):: q(im,jfirst-ng_s:jlast+ng_n,kfirst:klast,nq) ! ! !DESCRIPTION: ! ! Receive 4d north/south ghost latitudes (real*4) ! ! !REVISION HISTORY: ! 2005.03.20 Sawyer Creation from mp_recv4d_ns ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer :: gidu #if defined ( NOR4 ) write(iulog,*) 'Mod_comm: mp_recv4d_ns_r4 - r4 windows disabled - exiting' call exit(1) #endif #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_RANK (comm, gidu, ierror) call Win_Close(comm, r4_win) ! Recv from south if ( jfirst > 1 ) then r4_win%src = gidu-1 r4_win%offset_r = ifromsouth*idimsizz + (r4_win%ncall_r-1)*idimsizz*nbuf call Ga_Get4d_r4(comm, q, r4_win, im, jm, km, nq, & 1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, & 1, im, jfirst-ng_s, jfirst-1, kfirst, klast, 1, nq, & ga_r4_r ) endif ! Recv from north if ( jlast < jm ) then r4_win%src = gidu+1 r4_win%offset_r = ifromnorth*idimsizz + (r4_win%ncall_r-1)*idimsizz*nbuf call Ga_Get4d_r4(comm, q, r4_win, im, jm, km, nq, & 1, im, jfirst-ng_s, jlast+ng_n, kfirst, klast, 1, nq, & 1, im, jlast+1, jlast+ng_n, kfirst, klast, 1, nq, & ga_r4_r ) endif call Win_Finalize(comm, r4_win) #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_recv4d_ns_r4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_send2_ns --- Send 2 variables north/south ghost latitudes ! ! !INTERFACE: subroutine mp_send2_ns(comm, im, jm, km, jfirst, jlast, kfirst, klast, & 1,9 nd, q1, q2) ! ! !INPUT PARAMETERS: integer, intent(in):: comm ! communicator integer, intent(in):: im, jm, km integer, intent(in):: jfirst, jlast integer, intent(in):: kfirst, klast !careful: klast might be klast+1 integer, intent(in):: nd real(r8), intent(in):: q1(im,jfirst-nd:jlast+nd,kfirst:klast) real(r8), intent(in):: q2(im,jfirst-nd:jlast+nd,kfirst:klast) ! ! !DESCRIPTION: ! ! Send 2 variables north/south ghost latitudes ! ! !REVISION HISTORY: ! 2001.09.01 Lin ! 2002.02.15 Putman Modified for Global Arrays code ! 2002.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer :: gidu #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_RANK (comm, gidu, ierror) call Win_Open(comm, t1_win) ! Send to south if ( jfirst > 1 ) then t1_win%src = gidu - 1 t1_win%size_r = im*(klast-kfirst+1) t1_win%offset_r = ifromsouth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf call Ga_RecvInit_r8(comm, t1_win, ga_t1_r) t1_win%offset_r = t1_win%offset_r + im*(klast-kfirst+1) call Ga_RecvInit_r8(comm, t1_win, ga_t1_r) t1_win%dest = gidu - 1 t1_win%offset_s = igosouth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf call Ga_Put4d_r8( comm, q1, t1_win, im, jm, km, 2, & 1, im, jfirst-nd, jlast+nd, kfirst, klast, 1, 1, & 1, im, jfirst, jfirst, kfirst, klast, 1, 1, & ga_t1_s, ga_t1_r ) t1_win%offset_s = t1_win%offset_s + im*(klast-kfirst+1) call Ga_Put4d_r8( comm, q2, t1_win, im, jm, km, 2, & 1, im, jfirst-nd, jlast+nd, kfirst, klast, 2, 2, & 1, im, jfirst, jfirst, kfirst, klast, 2, 2, & ga_t1_s, ga_t1_r ) endif ! Send to north if ( jlast < jm ) then t1_win%src = gidu + 1 t1_win%size_r = im*(klast-kfirst+1) t1_win%offset_r = ifromnorth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf call Ga_RecvInit_r8(comm, t1_win, ga_t1_r) t1_win%offset_r = t1_win%offset_r + im*(klast-kfirst+1) call Ga_RecvInit_r8(comm, t1_win, ga_t1_r) t1_win%dest = gidu + 1 t1_win%offset_s = igonorth*idimsizz + (t1_win%ncall_s-1)*idimsizz*nbuf call Ga_Put4d_r8( comm, q1, t1_win, im, jm, km, 2, & 1, im, jfirst-nd, jlast+nd, kfirst, klast, 1, 1, & 1, im, jlast, jlast, kfirst, klast, 1, 1, & ga_t1_s, ga_t1_r ) t1_win%offset_s = t1_win%offset_s + im*(klast-kfirst+1) call Ga_Put4d_r8( comm, q2, t1_win, im, jm, km, 2, & 1, im, jfirst-nd, jlast+nd, kfirst, klast, 2, 2, & 1, im, jlast, jlast, kfirst, klast, 2, 2, & ga_t1_s, ga_t1_r ) endif #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_send2_ns !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_recv2_ns --- Receive 2 variables north/south ghost latitudes ! ! !INTERFACE: subroutine mp_recv2_ns(comm, im, jm, km, jfirst, jlast, kfirst, klast, & 1,6 nd, q1, q2) ! ! !INPUT PARAMETERS: integer, intent(in):: comm ! communicator integer, intent(in):: im, jm, km integer, intent(in):: jfirst, jlast integer, intent(in):: kfirst, klast !careful: klast might be klast+1 integer, intent(in):: nd ! !OUTPUT PARAMETERS: real(r8), intent(inout):: q1(im,jfirst-nd:jlast+nd,kfirst:klast) real(r8), intent(inout):: q2(im,jfirst-nd:jlast+nd,kfirst:klast) ! ! !DESCRIPTION: ! ! Receive 2 variables north/south ghost latitudes ! ! !REVISION HISTORY: ! 2001.09.01 Lin ! 2002.02.15 Putman Modified for Global Arrays code ! 2002.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! !LOCAL VARIABLES: integer j integer :: gidu #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_RANK (comm, gidu, ierror) call Win_Close(comm, t1_win) ! Recv from south if ( jfirst > 1 ) then j = jfirst - 1 t1_win%src = gidu - 1 t1_win%offset_r = ifromsouth*idimsizz + (t1_win%ncall_r-1)*idimsizz*nbuf call Ga_Get4d_r8( comm, q1, t1_win, im, jm, km, 2, & 1, im, jfirst-nd, jlast+nd, kfirst, klast, 1, 1, & 1, im, j, j, kfirst, klast, 1, 1, & ga_t1_r ) t1_win%offset_r = t1_win%offset_r + im*(klast-kfirst+1) call Ga_Get4d_r8( comm, q2, t1_win, im, jm, km, 2, & 1, im, jfirst-nd, jlast+nd, kfirst, klast, 2, 2, & 1, im, j, j, kfirst, klast, 2, 2, & ga_t1_r ) endif ! Recv from north if ( jlast < jm ) then j = jlast + 1 t1_win%src = gidu + 1 t1_win%offset_r = ifromnorth*idimsizz + (t1_win%ncall_r-1)*idimsizz*nbuf call Ga_Get4d_r8( comm, q1, t1_win, im, jm, km, 2, & 1, im, jfirst-nd, jlast+nd, kfirst, klast, 1, 1, & 1, im, j, j, kfirst, klast, 1, 1, & ga_t1_r ) t1_win%offset_r = t1_win%offset_r + im*(klast-kfirst+1) call Ga_Get4d_r8( comm, q2, t1_win, im, jm, km, 2, & 1, im, jfirst-nd, jlast+nd, kfirst, klast, 2, 2, & 1, im, j, j, kfirst, klast, 2, 2, & ga_t1_r ) endif call Win_Finalize(comm, t1_win) #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_recv2_ns !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_send3d --- Send ghost region ! ! !INTERFACE: subroutine mp_send3d(comm, dest, src, im, jm, km, if, il, jf, jl, kf, kl, & 31,3 i1, i2, j1, j2, k1, k2, q) ! ! !INPUT PARAMETERS: integer, intent(in):: comm ! communicator integer, intent(in):: dest, src integer, intent(in):: im, jm, km integer, intent(in):: if, il, jf, jl, kf, kl integer, intent(in):: i1, i2, j1, j2, k1, k2 real(r8), intent(in):: q(if:il, jf:jl, kf:kl) ! ! !DESCRIPTION: ! ! Send a general 3d real*8 ghost region ! ! !REVISION HISTORY: ! 02.04.15 Putman ! !EOP !------------------------------------------------------------------------------ !BOC #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_SIZE (comm, numcomm, ierror) call Win_Open(comm, t1_win) ! Init Recv src if ( src >= 0 .and. src < numcomm ) then ! is PE in valid range? t1_win%src = src t1_win%size_r = (i2-i1+1)*(j2-j1+1)*(k2-k1+1) ! chunk size t1_win%offset_r = (t1_win%ncall_s-1)*idimsizz*nbuf call Ga_RecvInit_r8(comm, t1_win, ga_t1_r) endif ! Send ghost region if ( dest >= 0 .and. dest < numcomm ) then t1_win%dest = dest t1_win%offset_s = (t1_win%ncall_s-1)*idimsizz*nbuf call Ga_Put4d_r8( comm, q, t1_win, im, jm, km, 1, & if, il, jf, jl, kf, kl, 1, 1, & i1, i2, j1, j2, k1, k2, 1, 1, ga_t1_s, ga_t1_r ) endif #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_send3d !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_recv3d --- Recv ghost region ! ! !INTERFACE: subroutine mp_recv3d(comm, src, im, jm, km, if, il, jf, jl, kf, kl, & 31,3 i1, i2, j1, j2, k1, k2, qout) ! ! !INPUT PARAMETERS: integer, intent(in):: comm ! communicator integer, intent(in):: src integer, intent(in):: im, jm, km integer, intent(in):: if, il, jf, jl, kf, kl integer, intent(in):: i1, i2, j1, j2, k1, k2 ! !OUTPUT PARAMETERS: real(r8), intent(inout):: qout(if:il, jf:jl, kf:kl) ! ! !DESCRIPTION: ! ! Recv a general 3d real*8 ghost region ! ! !REVISION HISTORY: ! 02.04.15 Putman ! !EOP !------------------------------------------------------------------------------ !BOC #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_SIZE (comm, numcomm, ierror) call Win_Close(comm, t1_win) ! Recv from src if ( src >= 0 .and. src < numcomm ) then ! is PE in valid range? t1_win%src = src t1_win%offset_r = (t1_win%ncall_r-1)*idimsizz*nbuf call Ga_Get4d_r8( comm, qout, t1_win, im, jm, km, 1, & if, il, jf, jl, kf, kl, 1, 1, & i1, i2, j1, j2, k1, k2, 1, 1, ga_t1_r ) endif call Win_Finalize(comm, t1_win) #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_recv3d !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_send3d_2 --- Send 2 ghost regions ! ! !INTERFACE: subroutine mp_send3d_2(comm, dest, src, im, jm, km, if, il, jf, jl, kf, kl, & 2,5 i1, i2, j1, j2, k1, k2, q1, q2) ! ! !INPUT PARAMETERS: integer, intent(in):: comm ! communicator integer, intent(in):: dest, src integer, intent(in):: im, jm, km integer, intent(in):: if, il, jf, jl, kf, kl integer, intent(in):: i1, i2, j1, j2, k1, k2 real(r8), intent(in):: q1(if:il, jf:jl, kf:kl) real(r8), intent(in):: q2(if:il, jf:jl, kf:kl) ! ! !DESCRIPTION: ! ! Send two general 3d real*8 ghost region ! ! !REVISION HISTORY: ! 02.04.15 Putman ! !EOP !------------------------------------------------------------------------------ !BOC #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_SIZE (comm, numcomm, ierror) call Win_Open(comm, t1_win) ! Init Recv src if ( src >= 0 .and. src < numcomm ) then ! is PE in valid range? t1_win%src = src t1_win%size_r = (i2-i1+1)*(j2-j1+1)*(k2-k1+1) ! chunk size t1_win%offset_r = (t1_win%ncall_s-1)*idimsizz*nbuf call Ga_RecvInit_r8(comm, t1_win, ga_t1_r) t1_win%offset_r = t1_win%offset_r + t1_win%size_r call Ga_RecvInit_r8(comm, t1_win, ga_t1_r) endif ! Send ghost region if ( dest >= 0 .and. dest < numcomm ) then t1_win%dest = dest t1_win%offset_s = (t1_win%ncall_s-1)*idimsizz*nbuf call Ga_Put4d_r8( comm, q1, t1_win, im, jm, km, 2, & if, il, jf, jl, kf, kl, 1, 1, & i1, i2, j1, j2, k1, k2, 1, 1, ga_t1_s, ga_t1_r ) t1_win%offset_s = t1_win%offset_s + (i2-i1+1)*(j2-j1+1)*(k2-k1+1) call Ga_Put4d_r8( comm, q2, t1_win, im, jm, km, 2, & if, il, jf, jl, kf, kl, 2, 2, & i1, i2, j1, j2, k1, k2, 2, 2, ga_t1_s, ga_t1_r ) endif #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_send3d_2 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_recv3d_2 --- Recv 2 ghost regions ! ! !INTERFACE: subroutine mp_recv3d_2(comm, src, im, jm, km, if, il, jf, jl, kf, kl, & 2,4 i1, i2, j1, j2, k1, k2, qout1, qout2) ! ! !INPUT PARAMETERS: integer, intent(in):: comm ! communicator integer, intent(in):: src integer, intent(in):: im, jm, km integer, intent(in):: if, il, jf, jl, kf, kl integer, intent(in):: i1, i2, j1, j2, k1, k2 ! !OUTPUT PARAMETERS: real(r8), intent(inout):: qout1(if:il, jf:jl, kf:kl) real(r8), intent(inout):: qout2(if:il, jf:jl, kf:kl) ! ! !DESCRIPTION: ! ! Recv two general 3d real*8 ghost regions ! ! !REVISION HISTORY: ! 02.04.15 Putman ! !EOP !------------------------------------------------------------------------------ !BOC #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_SIZE (comm, numcomm, ierror) call Win_Close(comm, t1_win) ! Recv from src if ( src >= 0 .and. src < numcomm ) then ! is PE in valid range? t1_win%src = src t1_win%offset_r = (t1_win%ncall_r-1)*idimsizz*nbuf call Ga_Get4d_r8( comm, qout1, t1_win, im, jm, km, 2, & if, il, jf, jl, kf, kl, 1, 1, & i1, i2, j1, j2, k1, k2, 1, 1, ga_t1_r ) t1_win%offset_r = t1_win%offset_r + (i2-i1+1)*(j2-j1+1)*(k2-k1+1) call Ga_Get4d_r8( comm, qout2, t1_win, im, jm, km, 2, & if, il, jf, jl, kf, kl, 2, 2, & i1, i2, j1, j2, k1, k2, 2, 2, ga_t1_r ) endif call Win_Finalize(comm, t1_win) #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_recv3d_2 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_barrier --- Synchronize all SPMD processes ! ! !INTERFACE: subroutine mp_barrier (comm) 3 ! ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator ! !DESCRIPTION: ! ! Synchronize all SPMD processes ! ! !REVISION HISTORY: ! 2001.09.01 Lin ! 2002.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC call MPI_BARRIER(comm, ierror) !EOC end subroutine mp_barrier !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Win_Open --- Open a communication window ! ! !INTERFACE: subroutine Win_Open(comm, win) 5 ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator ! !OUTPUT PARAMETERS: type(window), intent(inout):: win ! ! !DESCRIPTION: ! ! Begin a communication epoch, by opening a comm window. ! Update number of send calls on the window (win%ncall_s). ! Barrier synchronzize if necessary. ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC win%ncall_s = win%ncall_s + 1 !EOC end subroutine Win_Open !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Win_Close --- Close a communication window ! ! !INTERFACE: subroutine Win_Close(comm, win) 5 ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator ! !OUTPUT PARAMETERS: type(window), intent(inout):: win ! ! !DESCRIPTION: ! ! End a communication epoch, by closing a comm window. ! Update number of receive calls on the window (win%ncall_r). ! Barrier synchronzize if necessary. ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC win%ncall_r = win%ncall_r + 1 !EOC end subroutine Win_Close !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Win_Finalize --- Reset a communication window after a comm epoch. ! ! !INTERFACE: subroutine Win_Finalize(comm, win) 5 ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator ! !OUTPUT PARAMETERS: type(window), intent(inout):: win ! ! !DESCRIPTION: ! ! Complete a communication epoch and reset a comm window. ! Barrier synchronzize if necessary. ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC if (win%ncall_s == win%ncall_r) then call MPI_WAITALL(win%nsend, win%sqest, Stats, ierror) win%nsend = 0 win%nrecv = 0 win%nread = 0 win%ncall_s = 0 win%ncall_r = 0 endif !EOC end subroutine Win_Finalize !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_Put4d_r8 --- Write to real*8 4d global array ! ! !INTERFACE: subroutine Ga_Put4d_r8 ( comm, q, win, im, jm, km, nq, & 9 ifrom, ito, jfrom, jto, kfrom, kto, & nqfrom, nqto, i1, i2, j1, j2, k1, k2, & nq1, nq2, ga_s, ga_r ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(window), intent(inout) :: win ! Global Array Window integer, intent(in) :: im, jm, km, nq integer, intent(in) :: ifrom, ito, jfrom, jto, kfrom, kto, nqfrom, nqto real(r8), intent(in) :: q(ifrom:ito,jfrom:jto,kfrom:kto,nqfrom:nqto) integer, intent(in) :: i1, i2, j1, j2, k1, k2, nq1, nq2 ! !OUTPUT PARAMETERS: real(r8), intent(inout):: ga_s(win%size) real(r8), intent(inout):: ga_r(win%size) ! ! !DESCRIPTION: ! ! Write to real*8 4d global array. ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer i, j, k, iq, inc, inc1 integer i_length, j_length, k_length, ij_length, ijk_length integer send_tag, qsize integer :: gidu call MPI_COMM_RANK (comm, gidu, ierror) i_length = i2-i1+1 j_length = j2-j1+1 k_length = k2-k1+1 ij_length = i_length*j_length ijk_length = i_length*j_length*k_length ! Begin Non-Blocking Sends do iq = nq1, nq2 !$omp parallel do private(i,j,k,inc,inc1) do k = k1, k2 inc1 = (win%offset_s) + ((iq-nq1)*ijk_length) & + ((k-k1)*ij_length) -i1+1 do j = j1, j2 inc = inc1 + (j-j1)*i_length do i = i1, i2 ga_s(inc+i) = q(i,j,k,iq) enddo enddo enddo enddo qsize = (i2-i1+1)*(j2-j1+1)*(k2-k1+1)*(nq2-nq1+1) send_tag = gidu win%nsend = win%nsend + 1 call MPI_ISEND(ga_s(win%offset_s+1), qsize, mp_r8, win%dest, & send_tag, comm, win%sqest(win%nsend), ierror) !EOC end subroutine Ga_Put4d_r8 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_RecvInit_r8 --- Initiate real*8 Non-Blocking receive ! ! !INTERFACE: subroutine Ga_RecvInit_r8( comm, win, ga ) 9 ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(window), intent(inout) :: win ! Global Array Window ! !OUTPUT PARAMETERS: real(r8), intent(inout):: ga(win%size) ! ! !DESCRIPTION: ! ! Initiate real*8 Non-Blocking receive ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! 03.06.06 Sawyer Added else clause ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer qsize, recv_tag if (win%size >= win%offset_r + win%size_r) then recv_tag = win%src qsize = win%size_r win%nrecv = win%nrecv + 1 call MPI_IRECV(ga(win%offset_r+1), qsize, mp_r8, win%src, & recv_tag, comm, win%rqest(win%nrecv), ierror) else write(iulog,*) "Fatal ga_recvinit_r8: receive window out of space - exiting" write(iulog,*) 'gid win%size win%offset_r win%size_r = ', gid, & win%size, win%offset_r, win%size_r call exit(1) endif !EOC end subroutine Ga_RecvInit_r8 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_Get4d_r8 --- Read from real*8 4d global array ! ! !INTERFACE: subroutine Ga_Get4d_r8 ( comm, q, win, im, jm, km, nq, & 9 ifrom, ito, jfrom, jto, kfrom, kto, & nqfrom, nqto, i1, i2, j1, j2, k1, k2, & nq1, nq2, ga ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(window), intent(inout) :: win ! Global Array Window integer, intent(in) :: im, jm, km, nq integer, intent(in) :: i1, i2, j1, j2, k1, k2, nq1, nq2 integer, intent(in) :: ifrom, ito, jfrom, jto, kfrom, kto, nqfrom, nqto real(r8), intent(in) :: ga(win%size) ! !OUTPUT PARAMETERS: real(r8), intent(inout) :: q(ifrom:ito, jfrom:jto, kfrom:kto, nqfrom:nqto) ! ! !DESCRIPTION: ! ! Read from real*8 4d global array. ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer i, j, k, iq, inc, inc1 integer i_length, j_length, k_length, ij_length, ijk_length win%nread = win%nread + 1 call MPI_WAIT(win%rqest(win%nread), Status, ierror) i_length = i2-i1+1 j_length = j2-j1+1 k_length = k2-k1+1 ij_length = i_length*j_length ijk_length = i_length*j_length*k_length do iq = nq1, nq2 !$omp parallel do private(i,j,k,inc,inc1) do k = k1, k2 inc1 = (win%offset_r) + ((iq-nq1)*ijk_length) & + ((k-k1)*ij_length) -i1+1 do j = j1, j2 inc = inc1 + (j-j1)*i_length do i = i1, i2 q(i,j,k,iq) = ga(inc+i) enddo enddo enddo enddo !EOC end subroutine Ga_Get4d_r8 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_Put4d_r4 --- Write to real*4 4d global array ! ! !INTERFACE: subroutine Ga_Put4d_r4 ( comm, q, win, im, jm, km, nq, & 2 ifrom, ito, jfrom, jto, kfrom, kto, & nqfrom, nqto, i1, i2, j1, j2, k1, k2, & nq1, nq2, ga_s, ga_r ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(window), intent(inout) :: win ! Global Array Window integer, intent(in) :: im, jm, km, nq integer, intent(in) :: ifrom, ito, jfrom, jto, kfrom, kto, nqfrom, nqto real(r4), intent(in) :: q(ifrom:ito,jfrom:jto,kfrom:kto,nqfrom:nqto) integer, intent(in) :: i1, i2, j1, j2, k1, k2, nq1, nq2 ! !OUTPUT PARAMETERS: real(r4), intent(inout):: ga_s(win%size) real(r4), intent(inout):: ga_r(win%size) ! ! !DESCRIPTION: ! ! Write to real*4 4d global array. ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer i, j, k, iq, inc, inc1 integer i_length, j_length, k_length, ij_length, ijk_length integer send_tag, qsize integer :: gidu call MPI_COMM_RANK (comm, gidu, ierror) #if defined ( NOR4 ) write(iulog,*) 'Mod_comm: Ga_Put4d_r4 - r4 windows disabled - exiting' call exit(1) #endif i_length = i2-i1+1 j_length = j2-j1+1 k_length = k2-k1+1 ij_length = i_length*j_length ijk_length = i_length*j_length*k_length ! Begin Non-Blocking Sends do iq = nq1, nq2 !$omp parallel do private(i,j,k,inc,inc1) do k = k1, k2 inc1 = (win%offset_s) + ((iq-nq1)*ijk_length) & + ((k-k1)*ij_length) -i1+1 do j = j1, j2 inc = inc1 + (j-j1)*i_length do i = i1, i2 ga_s(inc+i) = q(i,j,k,iq) enddo enddo enddo enddo qsize = (i2-i1+1)*(j2-j1+1)*(k2-k1+1)*(nq2-nq1+1) send_tag = gidu win%nsend = win%nsend + 1 call MPI_ISEND(ga_s(win%offset_s+1), qsize, mp_r4, win%dest, & send_tag, comm, win%sqest(win%nsend), ierror) !EOC end subroutine Ga_Put4d_r4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_RecvInit_r4 --- Initiate real*4 Non-Blocking receive ! ! !INTERFACE: subroutine Ga_RecvInit_r4( comm, win, ga ) 2 ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(window), intent(inout) :: win ! Global Array Window ! !OUTPUT PARAMETERS: real(r4), intent(inout):: ga(win%size) ! ! !DESCRIPTION: ! ! Initiate real*8 Non-Blocking receive ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! 03.06.06 Sawyer Added else clause ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer qsize, recv_tag #if defined ( NOR4 ) write(iulog,*) 'Mod_comm: Ga_RecvInit_r4 - r4 windows disabled - exiting' call exit(1) #endif if (win%size >= win%offset_r + win%size_r) then recv_tag = win%src qsize = win%size_r win%nrecv = win%nrecv + 1 call MPI_IRECV(ga(win%offset_r+1), qsize, mp_r4, win%src, & recv_tag, comm, win%rqest(win%nrecv), ierror) else write(iulog,*) "Fatal ga_recvinit_r4: receive window out of space - exiting" write(iulog,*) 'gid win%size win%offset_r win%size_r = ', gid, & win%size, win%offset_r, win%size_r call exit(1) endif !EOC end subroutine Ga_RecvInit_r4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_Get4d_r4 --- Read from real*4 4d global array ! ! !INTERFACE: subroutine Ga_Get4d_r4 ( comm, q, win, im, jm, km, nq, & 2 ifrom, ito, jfrom, jto, kfrom, kto, & nqfrom, nqto, i1, i2, j1, j2, k1, k2, & nq1, nq2, ga ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(window), intent(inout) :: win ! Global Array Window integer, intent(in) :: im, jm, km, nq integer, intent(in) :: i1, i2, j1, j2, k1, k2, nq1, nq2 integer, intent(in) :: ifrom, ito, jfrom, jto, kfrom, kto, nqfrom, nqto real(r4), intent(in) :: ga(win%size) ! !OUTPUT PARAMETERS: real(r4), intent(inout) :: q(ifrom:ito, jfrom:jto, kfrom:kto, nqfrom:nqto) ! ! !DESCRIPTION: ! ! Read from real*8 4d global array. ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer i, j, k, iq, inc, inc1 integer i_length, j_length, k_length, ij_length, ijk_length #if defined ( NOR4 ) write(iulog,*) 'Mod_comm: Ga_Get4d_r4 - r4 windows disabled - exiting' call exit(1) #endif win%nread = win%nread + 1 call MPI_WAIT(win%rqest(win%nread), Status, ierror) i_length = i2-i1+1 j_length = j2-j1+1 k_length = k2-k1+1 ij_length = i_length*j_length ijk_length = i_length*j_length*k_length do iq = nq1, nq2 !$omp parallel do private(i,j,k,inc,inc1) do k = k1, k2 inc1 = (win%offset_r) + ((iq-nq1)*ijk_length) & + ((k-k1)*ij_length) -i1+1 do j = j1, j2 inc = inc1 + (j-j1)*i_length do i = i1, i2 q(i,j,k,iq) = ga(inc+i) enddo enddo enddo enddo !EOC end subroutine Ga_Get4d_r4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_Put4d_i4 --- Write to integer*4 4d global array ! ! !INTERFACE: subroutine Ga_Put4d_i4 ( comm, q, win, im, jm, km, nq, & ifrom, ito, jfrom, jto, kfrom, kto, & nqfrom, nqto, i1, i2, j1, j2, k1, k2, & nq1, nq2, ga_s, ga_r ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(window), intent(inout) :: win ! Global Array Window integer, intent(in) :: im, jm, km, nq integer, intent(in) :: ifrom, ito, jfrom, jto, kfrom, kto, nqfrom, nqto integer(i4), intent(in) :: q(ifrom:ito,jfrom:jto,kfrom:kto,nqfrom:nqto) integer, intent(in) :: i1, i2, j1, j2, k1, k2, nq1, nq2 ! !OUTPUT PARAMETERS: integer(i4), intent(inout):: ga_s(win%size) integer(i4), intent(inout):: ga_r(win%size) ! ! !DESCRIPTION: ! ! Write to integer*4 4d global array. ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer i, j, k, iq, inc, inc1 integer i_length, j_length, k_length, ij_length, ijk_length integer send_tag, qsize integer :: gidu call MPI_COMM_RANK (comm, gidu, ierror) i_length = i2-i1+1 j_length = j2-j1+1 k_length = k2-k1+1 ij_length = i_length*j_length ijk_length = i_length*j_length*k_length ! Begin Non-Blocking Sends do iq = nq1, nq2 !$omp parallel do private(i,j,k,inc,inc1) do k = k1, k2 inc1 = (win%offset_s) + ((iq-nq1)*ijk_length) & + ((k-k1)*ij_length) -i1+1 do j = j1, j2 inc = inc1 + (j-j1)*i_length do i = i1, i2 ga_s(inc+i) = q(i,j,k,iq) enddo enddo enddo enddo qsize = (i2-i1+1)*(j2-j1+1)*(k2-k1+1)*(nq2-nq1+1) send_tag = gidu win%nsend = win%nsend + 1 call MPI_ISEND(ga_s(win%offset_s+1), qsize, mp_i4, win%dest, & send_tag, comm, win%sqest(win%nsend), ierror) !EOC end subroutine Ga_Put4d_i4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_RecvInit_i4 --- Initiate integer*4 Non-Blocking receive ! ! !INTERFACE: subroutine Ga_RecvInit_i4( comm, win, ga ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(window), intent(inout) :: win ! Global Array Window ! !OUTPUT PARAMETERS: integer(i4), intent(inout):: ga(win%size) ! ! !DESCRIPTION: ! ! Initiate integer*4 Non-Blocking receive ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! 06.05.21 Mirin Added else clause ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer qsize, recv_tag if (win%size >= win%offset_r + win%size_r) then recv_tag = win%src qsize = win%size_r win%nrecv = win%nrecv + 1 call MPI_IRECV(ga(win%offset_r+1), qsize, mp_i4, win%src, & recv_tag, comm, win%rqest(win%nrecv), ierror) else write(iulog,*) "Fatal ga_recvinit_i4: receive window out of space - exiting" write(iulog,*) 'gid win%size win%offset_r win%size_r = ', gid, & win%size, win%offset_r, win%size_r call exit(1) endif !EOC end subroutine Ga_RecvInit_i4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_Get4d_i4 --- Read from integer*4 4d global array ! ! !INTERFACE: subroutine Ga_Get4d_i4 ( comm, q, win, im, jm, km, nq, & ifrom, ito, jfrom, jto, kfrom, kto, & nqfrom, nqto, i1, i2, j1, j2, k1, k2, & nq1, nq2, ga ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(window), intent(inout) :: win ! Global Array Window integer, intent(in) :: im, jm, km, nq integer, intent(in) :: i1, i2, j1, j2, k1, k2, nq1, nq2 integer, intent(in) :: ifrom, ito, jfrom, jto, kfrom, kto, nqfrom, nqto integer(i4), intent(in) :: ga(win%size) ! !OUTPUT PARAMETERS: integer(i4), intent(inout) :: q(ifrom:ito, jfrom:jto, kfrom:kto, nqfrom:nqto) ! ! !DESCRIPTION: ! ! Read from integer*4 4d global array. ! ! !REVISION HISTORY: ! 02.02.15 Putman ! 02.04.09 Putman Added ProTeX documentation ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer i, j, k, iq, inc, inc1 integer i_length, j_length, k_length, ij_length, ijk_length win%nread = win%nread + 1 call MPI_WAIT(win%rqest(win%nread), Status, ierror) i_length = i2-i1+1 j_length = j2-j1+1 k_length = k2-k1+1 ij_length = i_length*j_length ijk_length = i_length*j_length*k_length do iq = nq1, nq2 !$omp parallel do private(i,j,k,inc,inc1) do k = k1, k2 inc1 = (win%offset_r) + ((iq-nq1)*ijk_length) & + ((k-k1)*ij_length) -i1+1 do j = j1, j2 inc = inc1 + (j-j1)*i_length do i = i1, i2 q(i,j,k,iq) = ga(inc+i) enddo enddo enddo enddo !EOC end subroutine Ga_Get4d_i4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_Broadcast_r8 --- Broadcast an real*8 1d global array ! ! !INTERFACE: subroutine Ga_Broadcast_r8 ( comm, q, isize ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator integer, intent(in) :: isize ! !OUTPUT PARAMETERS: real(r8), intent(inout) :: q(isize) ! ! !DESCRIPTION: ! ! Broadcast an real*8 1d global array. ! ! !REVISION HISTORY: ! 03.04.02 Putman ! !EOP !------------------------------------------------------------------------------ !BOC ! !LOCAL VARIABLES: call MPI_BCAST(q, isize, mp_r8, 0, comm, ierror) !EOC end subroutine Ga_Broadcast_r8 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_Broadcast_r4 --- Broadcast an real*4 1d global array ! ! !INTERFACE: subroutine Ga_Broadcast_r4 ( comm, q, isize ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator integer, intent(in) :: isize ! !OUTPUT PARAMETERS: real(r4), intent(inout) :: q(isize) ! ! !DESCRIPTION: ! ! Broadcast an real*4 1d global array. ! ! !REVISION HISTORY: ! 03.04.02 Putman ! !EOP !------------------------------------------------------------------------------ !BOC ! !LOCAL VARIABLES: #if defined ( NOR4 ) write(iulog,*) 'Mod_comm: Ga_Broadcast_r4 - r4 windows disabled - exiting' call exit(1) #endif call MPI_BCAST(q, isize, mp_r4, 0, comm, ierror) !EOC end subroutine Ga_Broadcast_r4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_Broadcast_i4 --- Broadcast an integer*4 1d global array ! ! !INTERFACE: subroutine Ga_Broadcast_i4 ( comm, q, isize ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator integer, intent(in) :: isize ! !OUTPUT PARAMETERS: integer(i4), intent(inout) :: q(isize) ! ! !DESCRIPTION: ! ! Broadcast an integer*4 1d global array. ! ! !REVISION HISTORY: ! 03.04.02 Putman ! !EOP !------------------------------------------------------------------------------ !BOC ! !LOCAL VARIABLES: call MPI_BCAST(q, isize, mp_i4, 0, comm, ierror) !EOC end subroutine Ga_Broadcast_i4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_AllToAll_r8 --- All to All of an real*8 1d global array ! ! !INTERFACE: subroutine Ga_AllToAll_r8 ( comm, q, Gsize, Lsize, istart ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator integer, intent(in) :: Gsize ! Global size of array integer, intent(in) :: Lsize ! size of Local portion integer, intent(in) :: istart ! starting point ! !OUTPUT PARAMETERS: real(r8), intent(inout) :: q(Gsize) ! ! !DESCRIPTION: ! ! All to All of a real*8 1d global array. ! ! !REVISION HISTORY: ! 03.04.02 Putman ! !EOP !------------------------------------------------------------------------------ !BOC ! !LOCAL VARIABLES: call MPI_ALLGATHER(q(istart), Lsize, mp_r8, q, Lsize, mp_r8, comm, ierror) !EOC end subroutine Ga_AllToAll_r8 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_AllToAll_r4 --- All to All of an real*4 1d global array ! ! !INTERFACE: subroutine Ga_AllToAll_r4 ( comm, q, Gsize, Lsize, istart ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator integer, intent(in) :: Gsize ! Global size of array integer, intent(in) :: Lsize ! size of Local portion integer, intent(in) :: istart ! starting point ! !OUTPUT PARAMETERS: real(r4), intent(inout) :: q(Gsize) ! ! !DESCRIPTION: ! ! All to All of an real*4 1d global array. ! ! !REVISION HISTORY: ! 03.04.02 Putman ! !EOP !------------------------------------------------------------------------------ !BOC ! !LOCAL VARIABLES: #if defined ( NOR4 ) write(iulog,*) 'Mod_comm: Ga_AllToAll_r4 - r4 windows disabled - exiting' call exit(1) #endif call MPI_ALLGATHER(q(istart), Lsize, mp_r4, q, Lsize, mp_r4, comm, ierror) !EOC end subroutine Ga_AllToAll_r4 !------------------------------------------------------------------------------ !------------------------------------------------------------------------------ !BOP ! !ROUTINE: Ga_AllToAll_i4 --- All to All of an integer*4 1d global array ! ! !INTERFACE: subroutine Ga_AllToAll_i4 ( comm, q, Gsize, Lsize, istart ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator integer, intent(in) :: Gsize ! Global size of array integer, intent(in) :: Lsize ! size of Local portion integer, intent(in) :: istart ! starting point ! !OUTPUT PARAMETERS: integer(i4), intent(inout) :: q(Gsize) ! ! !DESCRIPTION: ! ! All to All of an integer*4 1d global array. ! ! !REVISION HISTORY: ! 03.04.02 Putman ! !EOP !------------------------------------------------------------------------------ !BOC ! !LOCAL VARIABLES: call MPI_ALLGATHER(q(istart), Lsize, mp_i4, q, Lsize, mp_i4, comm, ierror) !EOC end subroutine Ga_AllToAll_i4 !------------------------------------------------------------------------------ !BOP ! !ROUTINE: get_partneroffset --- Computes partneroffset/type from descriptor ! ! !INTERFACE: subroutine get_partneroffset ( comm, send_bl, recv_bl ) 10 ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator ! !INPUT/OUTPUT PARAMETERS: type(blockdescriptor), intent(inout) :: send_bl(:) ! send blocks type(blockdescriptor), intent(inout) :: recv_bl(:) ! receive blocks ! ! !DESCRIPTION: ! Compute partneroffsets/types from other blockdescriptor ! information. Used exclusively for irregular communication ! in PILGRIM. ! ! !REVISION HISTORY: ! 03.10.31 Mirin Creation ! ! !BUGS: ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer :: i, j, k, ns, pos, por, numpsq, ierror integer :: ami(numpro,numpro), am(numpro,numpro) integer mod_method, num_s, num_r num_s = size(send_bl) num_r = size(recv_bl) do j = 1, num_s send_bl(j)%partneroffset = 0 send_bl(j)%partnertype = MPI_DATATYPE_NULL enddo do j = 1, num_r recv_bl(j)%partneroffset = 0 recv_bl(j)%partnertype = MPI_DATATYPE_NULL enddo end subroutine get_partneroffset !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_sendirr --- Initiate communication of contiguous parcels ! ! !INTERFACE: subroutine mp_sendirr ( comm, send_bl, recv_bl, q1in, q1out, q2in, q2out, & 52,3 modc ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks real(r8), intent(in) :: q1in(*) ! input array real(r8), optional, intent(in) :: q2in(*) ! second input array integer, optional, intent(in) :: modc(4) ! 1: classical, swap p2p, swap a2a ! 2: handshake ! 3: send vs isend ! 4: max number of outstanding requests ! !OUTPUT PARAMETERS: real(r8), intent(out) :: q1out(*) ! output array real(r8), optional, intent(out) :: q2out(*) ! second output array ! ! !DESCRIPTION: ! Communicate a number of contiguous parcels to/from arbitrary set of PEs. ! Modc(1): if 0, use original approach of posting all communications here and placing ! wait points in mp_recvirr; if 1, call swap routine with p2p messages; if 2, call swap ! routine with a2a messages. ! Modc(2): if 1, then apply handshaking (don't send until corresponding receive is posted) ! Modc(3): if 1, then use blocking send; otherwise use nonblocking send ! Modc(4): maximum number of outstanding requests (applies to swap routines only) ! ! !REVISION HISTORY: ! 02.08.13 Sawyer Creation ! 02.11.06 Mirin Optimizations ! 03.03.03 Sawyer Use partneroffset ! 03.06.24 Sawyer Integrated Use_Mpi_Types; added qout ! 04.02.24 Mirin Various mpi2 options ! 08.09.18 Mirin Major overhaul, to include approaches from Mirin and Worley ! 09.10.07 Worley eliminated mpi_recv from handshake logic ! ! !BUGS: ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer ipe, qsize, offset, blocksize, nparcels, offset_s, offset_r, ierr, mod_method integer p, mysize, nthpc, minsize, nthrd, pn, pt, tmpsize, unitsize, offset_0 integer i, j, send_tag, recv_tag, num_s, num_r integer :: offset_v (Max_Nparcels) integer :: hs_rcvids(numpro) integer signal integer ipe2, ceil2num integer onetwo logical twovar integer sw_local, maxreq_local logical hs_local, send_local logical sw_alltoall integer comm_pid #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif if (present(modc)) then sw_local = modc(1) hs_local = (modc(2) .eq. 1) send_local = (modc(3) .eq. 1) maxreq_local = modc(4) else sw_local = 0 hs_local = .true. send_local = .true. maxreq_local = -1 endif ! Do not call mp_swapirr unless mod_method equals 0 mod_method = recv_bl(1)%method if (mod_method .gt. 0) sw_local = 0 onetwo = 1 twovar = .false. if (present(q2in)) then onetwo = 2 twovar = .true. endif if (sw_local .gt. 0) then sw_alltoall = (sw_local .eq. 2) if (present(q2in)) then call mp_swapirr(comm, send_bl, recv_bl, q1in, q1out, q2in, q2out, & sw_handshake=hs_local, sw_maxreq=maxreq_local, & sw_alltoall=sw_alltoall, sw_send=send_local) else call mp_swapirr(comm, send_bl, recv_bl, q1in, q1out, & sw_handshake=hs_local, sw_maxreq=maxreq_local, & sw_alltoall=sw_alltoall, sw_send=send_local) endif else call MPI_COMM_RANK (comm, comm_pid, ierr) signal = 1 ceil2num = ceil2(numpro) ! num_s = 0 if this processes is not part of the sending decomposition num_s = size(send_bl) if (send_bl(1)%Nparcels == -1) then num_s = 0 endif ! num_r = 0 if this processes is not part of the receiving decomposition num_r = size(recv_bl) if (recv_bl(1)%Nparcels == -1) then num_r = 0 endif r8_win%ncall_s = r8_win%ncall_s + 1 if (mod_method .gt. 0) then ! ! mpi derived types if (r8_win%ncall_s .gt. MaxTrf-onetwo+1) then write(iulog,*) "mp_sendirr: derived type handle count exceeded - exiting" write(iulog,*) "r8_win%ncall_s MaxTrf = ", r8_win%ncall_s, MaxTrf call exit(1) endif ! ! MPI: Irecv over all processes ! if (hs_local) then hs_rcvids(:) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then if (ipe-1 /= comm_pid) & call MPI_IRECV ( signal, 1, mp_i4, ipe-1, comm_pid, comm, hs_rcvids(ipe), ierr ) endif enddo endif OutHandle(:,r8_win%ncall_s) = MPI_REQUEST_NULL if (twovar) OutHandle(:,r8_win%ncall_s+1) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_r) cycle ! ! Receive the buffers with MPI_Irecv. Non-blocking ! if ( recv_bl(ipe)%type /= MPI_DATATYPE_NULL ) then recv_tag = ipe-1 + modcam_tagoffset call mpi_irecv( q1out, 1, recv_bl(ipe)%type, ipe-1, recv_tag, & comm, OutHandle(ipe,r8_win%ncall_s), ierr ) if (twovar) then call mpi_irecv( q2out, 1, recv_bl(ipe)%type, ipe-1, recv_tag, & comm, OutHandle(ipe,r8_win%ncall_s+1), ierr ) endif if (hs_local) then if (ipe-1 /= comm_pid) & call MPI_SEND ( signal, 1, mp_i4, ipe-1, ipe-1, comm, ierr ) endif endif enddo ! ! MPI: Isend/Send over all processes; use risend/rsend with hs ! InHandle(:,r8_win%ncall_s) = MPI_REQUEST_NULL if (twovar) InHandle(:,r8_win%ncall_s+1) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle ! ! Send the individual buffers with non-blocking sends ! if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then send_tag = comm_pid + modcam_tagoffset if (hs_local) then if (ipe-1 /= comm_pid) & call MPI_WAIT ( hs_rcvids(ipe), Status, ierr ) if (send_local) then call mpi_rsend( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, ierr ) else call mpi_irsend( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, InHandle(ipe,r8_win%ncall_s), ierr ) endif if (twovar) then if (send_local) then call mpi_rsend( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, ierr ) else call mpi_irsend( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, InHandle(ipe,r8_win%ncall_s+1), ierr ) endif endif else if (send_local) then call mpi_send( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, ierr ) else call mpi_isend( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, InHandle(ipe,r8_win%ncall_s), ierr ) endif if (twovar) then if (send_local) then call mpi_send( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, ierr ) else call mpi_isend( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, InHandle(ipe,r8_win%ncall_s+1), ierr ) endif endif endif endif enddo else ! temporary contiguous buffers if (r8_win%ncall_s .gt. max_irr-onetwo+1) then write(iulog,*) "mp_sendirr: insufficient window storage - exiting" write(iulog,*) "r8_win%ncall_s max_irr = ", r8_win%ncall_s, max_irr call exit(1) endif unitsize = r8_win%size/max_irr ! issue call to receive data in global receive buffer offset_0 = (r8_win%ncall_s-1)*unitsize offset_s = offset_0 offset_r = offset_0 if (hs_local) then hs_rcvids(:) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle qsize = onetwo*send_bl(ipe)%Tot_Size if (qsize .ne. 0) then r8_win%dest = ipe-1 send_tag = comm_pid + modcam_tagoffset if (r8_win%dest /= comm_pid) & call MPI_IRECV ( signal, 1, mp_i4, r8_win%dest, send_tag, comm, hs_rcvids(ipe), ierr ) endif enddo endif do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_r) cycle r8_win%size_r = onetwo*recv_bl(ipe)%Tot_Size if (r8_win%size_r .ne. 0) then r8_win%offset_r = offset_r offset_r = offset_r + r8_win%size_r r8_win%src = ipe-1 if (onetwo*unitsize >= offset_r-offset_0) then recv_tag = r8_win%src + modcam_tagoffset qsize = r8_win%size_r r8_win%nrecv = r8_win%nrecv + 1 call MPI_IRECV(ga_r8_r(r8_win%offset_r+1), qsize, mp_r8, r8_win%src, & recv_tag, comm, r8_win%rqest(r8_win%nrecv), ierror) if (hs_local) then if (r8_win%src /= comm_pid) & call MPI_SEND ( signal, 1, mp_i4, r8_win%src, recv_tag, comm, ierror) endif else write(iulog,*) "Fatal mp_sendirr: receive window out of space - exiting" write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif endif enddo ! gather data into global send buffer do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle qsize = onetwo*send_bl(ipe)%Tot_Size if (qsize .ne. 0) then r8_win%dest = ipe-1 r8_win%offset_s = offset_s offset_s = offset_s + qsize if (offset_s-offset_0 .gt. onetwo*unitsize) then write(iulog,*) "Fatal mp_sendirr: send window out of space - exiting" write(iulog,*) 'comm_pid ipe unitsize offset_s offset_0 = ', comm_pid, & ipe, unitsize, offset_s, offset_0 call exit(1) endif offset_v(1) = r8_win%offset_s do j = 2, send_bl(ipe)%nparcels offset_v(j) = offset_v(j-1) + send_bl(ipe)%blocksizes(j-1) enddo do j = 1, send_bl(ipe)%nparcels do i = 1, send_bl(ipe)%blocksizes(j) ga_r8_s(offset_v(j)+i) = q1in(send_bl(ipe)%displacements(j)+i) enddo enddo if (twovar) then do j = 1, send_bl(ipe)%nparcels do i = 1, send_bl(ipe)%blocksizes(j) ga_r8_s(send_bl(ipe)%Tot_Size+offset_v(j)+i) = q2in(send_bl(ipe)%displacements(j)+i) enddo enddo endif ! nonblocking send send_tag = comm_pid + modcam_tagoffset r8_win%nsend = r8_win%nsend + 1 if (hs_local) then if (r8_win%dest /= comm_pid) & call MPI_WAIT ( hs_rcvids(ipe), Status, ierr ) if (send_local) then call MPI_RSEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, & send_tag, comm, ierr) else call MPI_IRSEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, & send_tag, comm, r8_win%sqest(r8_win%nsend), ierr) endif else if (send_local) then call MPI_SEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, & send_tag, comm, ierr) else call MPI_ISEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, & send_tag, comm, r8_win%sqest(r8_win%nsend), ierr) endif endif endif enddo endif ! mod_method if (twovar) r8_win%ncall_s = r8_win%ncall_s + 1 endif ! sw_local #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif end subroutine mp_sendirr !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_recvirr --- Finalize communication of contiguous parcels ! ! !INTERFACE: subroutine mp_recvirr ( comm, send_bl, recv_bl, q1in, q1out, q2in, q2out, & 52,1 modc ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks real(r8), intent(in) :: q1in(*) ! input array real(r8), optional, intent(in) :: q2in(*) ! second input array integer, optional, intent(in) :: modc(4) ! 1: classical, swap p2p, swap a2a ! 2: handshake ! 3: send vs isend ! 4: max number of outstanding requests ! !INPUT/OUTPUT PARAMETERS: real(r8), intent(inout) :: q1out(*) ! output array real(r8), optional, intent(inout) :: q2out(*) ! second output array ! ! !DESCRIPTION: ! Complete transfer of a generalized region initiated by {\tt mp\_sendirr}. ! Communicate a number of contiguous parcels to/from arbitrary set of PEs. ! Modc(1): if 0, use original approach of posting all communications in mp_sendirr and ! placing wait points here; otherwise don't do anything - mp_swapirr is called from mp_sendirr. ! Modc(3): if 1, then use blocking send; otherwise use nonblocking send ! ! !REVISION HISTORY: ! 02.08.15 Sawyer Creation ! 02.11.06 Mirin Optimizations ! 03.03.03 Sawyer Now using packed arrays for MPI2 ! 04.02.24 Mirin Various mpi2 options ! 08.09.18 Mirin Major overhaul, to include approaches from Mirin and Worley ! !EOP !------------------------------------------------------------------------------ !BOC integer :: ipe, blocksize, offset_r, mod_method integer unitsize, offset_0 integer Ierr integer InStats(numpro*MPI_STATUS_SIZE) integer OutStats(numpro*MPI_STATUS_SIZE) integer i, j, num_r, num_s integer :: offset_v (Max_Nparcels) integer ipe2, ceil2num integer onetwo logical twovar integer sw_local, maxreq_local logical hs_local, send_local logical sw_alltoall integer comm_size, comm_pid if (present(modc)) then sw_local = modc(1) hs_local = (modc(2) .eq. 1) send_local = (modc(3) .eq. 1) maxreq_local = modc(4) else sw_local = 0 hs_local = .true. send_local = .true. maxreq_local = -1 endif ! Do not call mp_swapirr (hence return) unless mod_method equals 0 mod_method = recv_bl(1)%method if (mod_method .gt. 0) sw_local = 0 ! Return if swap_irr if (sw_local .gt. 0) return #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif onetwo = 1 twovar = .false. if (present(q2in)) then onetwo = 2 twovar = .true. endif call MPI_COMM_SIZE (comm, comm_size, ierr) call MPI_COMM_RANK (comm, comm_pid, ierr) ceil2num = ceil2(numpro) ! num_s = 0 if this processes is not part of the sending decomposition num_s = size(send_bl) if (send_bl(1)%Nparcels == -1) then num_s = 0 endif ! num_r = 0 if this processes is not part of the receiving decomposition num_r = size(recv_bl) if (recv_bl(1)%Nparcels == -1) then num_r = 0 endif r8_win%ncall_r = r8_win%ncall_r + 1 if (mod_method .gt. 0) then ! mpi derived types if (r8_win%ncall_r .gt. MaxTrf-onetwo+1) then write(iulog,*) "mp_recvirr: derived type handle count exceeded - exiting" write(iulog,*) "r8_win%ncall_r MaxTrf = ", r8_win%ncall_r, MaxTrf call exit(1) endif if (num_s .gt. 0 .and. (.not. send_local)) then CALL MPI_WAITALL( comm_size, InHandle(:,r8_win%ncall_r), InStats, Ierr ) if (twovar) then CALL MPI_WAITALL( comm_size, InHandle(:,r8_win%ncall_r+1), InStats, Ierr ) endif endif if (num_r .gt. 0) then CALL MPI_WAITALL( comm_size, OutHandle(:,r8_win%ncall_r), OutStats, Ierr ) if (twovar) then CALL MPI_WAITALL( comm_size, OutHandle(:,r8_win%ncall_r+1), OutStats, Ierr ) endif endif else ! temporary contiguous buffer / global window if (r8_win%ncall_r .gt. max_irr-onetwo+1) then write(iulog,*) "mp_recvirr: insufficient window storage - exiting" write(iulog,*) "r8_win%ncall_r max_irr = ", r8_win%ncall_r, max_irr call exit(1) endif unitsize = r8_win%size/max_irr ! scatter data from global receive buffer to final destination offset_0 = (r8_win%ncall_r-1)*unitsize offset_r = offset_0 do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_r) cycle r8_win%size_r = onetwo*recv_bl(ipe)%Tot_Size if (r8_win%size_r .ne. 0) then r8_win%offset_r = offset_r offset_r = offset_r + r8_win%size_r if (offset_r-offset_0 .gt. onetwo*unitsize) then write(iulog,*) "Fatal mp_recvirr: receive window out of space - exiting" write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif r8_win%nread = r8_win%nread + 1 call MPI_WAIT(r8_win%rqest(r8_win%nread), Status, ierr) offset_v(1) = r8_win%offset_r do j = 2, recv_bl(ipe)%Nparcels offset_v(j) = offset_v(j-1) + recv_bl(ipe)%blocksizes(j-1) enddo do j = 1, recv_bl(ipe)%Nparcels do i = 1, recv_bl(ipe)%blocksizes(j) q1out(recv_bl(ipe)%displacements(j)+i) = ga_r8_r(offset_v(j)+i) enddo enddo if (twovar) then do j = 1, recv_bl(ipe)%Nparcels do i = 1, recv_bl(ipe)%blocksizes(j) q2out(recv_bl(ipe)%displacements(j)+i) = ga_r8_r(recv_bl(ipe)%Tot_Size+offset_v(j)+i) enddo enddo endif endif enddo if ((r8_win%ncall_s == r8_win%ncall_r + onetwo - 1) .and. (.not. send_local)) then call MPI_WAITALL(r8_win%nsend, r8_win%sqest, Stats, ierror) endif endif ! mod_method .gt. 0 if (twovar) r8_win%ncall_r = r8_win%ncall_r + 1 if (r8_win%ncall_s == r8_win%ncall_r) then r8_win%nsend = 0 r8_win%nrecv = 0 r8_win%nread = 0 r8_win%ncall_s = 0 r8_win%ncall_r = 0 endif #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_recvirr !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_sendirr_r4 --- Initiate communication of contiguous parcels - r4 ! ! !INTERFACE: subroutine mp_sendirr_r4 ( comm, send_bl, recv_bl, q1in, q1out, q2in, q2out, & 4 modc ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks real(r4), intent(in) :: q1in(*) ! input array real(r4), optional, intent(in) :: q2in(*) ! second input array integer, optional, intent(in) :: modc(4) ! 1: classical, swap p2p, swap a2a ! 2: handshake ! 3: send vs isend ! 4: max number of outstanding requests ! !OUTPUT PARAMETERS: real(r4), intent(out) :: q1out(*) ! output array real(r4), optional, intent(out) :: q2out(*) ! second output array ! ! !DESCRIPTION: ! Communicate a number of contiguous parcels to/from arbitrary set of PEs. ! Modc(1): if 0, use original approach of posting all communications here and placing ! wait points in mp_recvirr; if 1, call swap routine with p2p messages; if 2, call swap ! routine with a2a messages. ! Modc(2): if 1, then apply handshaking (don't send until corresponding receive is posted) ! Modc(3): if 1, then use blocking send; otherwise use nonblocking send ! Modc(4): maximum number of outstanding requests (applies to swap routines only) ! ! !REVISION HISTORY: ! 02.08.13 Sawyer Creation ! 02.11.06 Mirin Optimizations ! 03.03.03 Sawyer Use partneroffset ! 03.06.24 Sawyer Integrated Use_Mpi_Types; added qout ! 04.02.24 Mirin Various mpi2 options ! 08.09.18 Mirin No-op version ! ! !BUGS: ! !EOP !------------------------------------------------------------------------------ !BOC ! write(iulog,*) 'Mod_comm: mp_sendirr_r4 - r4 no longer supported - exiting' call exit(1) !EOC end subroutine mp_sendirr_r4 !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_recvirr_r4 --- Finalize communication of contiguous parcels - r4 ! ! !INTERFACE: subroutine mp_recvirr_r4 ( comm, send_bl, recv_bl, q1in, q1out, q2in, q2out, & 4 modc ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks real(r4), intent(in) :: q1in(*) ! input array real(r4), optional, intent(in) :: q2in(*) ! second input array integer, optional, intent(in) :: modc(4) ! 1: classical, swap p2p, swap a2a ! 2: handshake ! 3: send vs isend ! 4: max number of outstanding requests ! !INPUT/OUTPUT PARAMETERS: real(r4), intent(inout) :: q1out(*) ! output array real(r4), optional, intent(inout) :: q2out(*) ! second output array ! ! !DESCRIPTION: ! Complete transfer of a generalized region initiated by {\tt mp\_sendirr}. ! Communicate a number of contiguous parcels to/from arbitrary set of PEs. ! Modc(1): if 0, use original approach of posting all communications in mp_sendirr and ! placing wait points here; otherwise don't do anything - mp_swapirr is called from mp_sendirr. ! Modc(3): if 1, then use blocking send; otherwise use nonblocking send ! ! !REVISION HISTORY: ! 02.08.15 Sawyer Creation ! 02.11.06 Mirin Optimizations ! 03.03.03 Sawyer Now using packed arrays for MPI2 ! 04.02.24 Mirin Various mpi2 options ! 08.09.18 Mirin No-op version ! ! !BUGS: ! !EOP !------------------------------------------------------------------------------ !BOC ! write(iulog,*) 'Mod_comm: mp_recvirr_r4 - r4 no longer supported - exiting' call exit(1) !EOC end subroutine mp_recvirr_r4 !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_sendirr_i4 --- Initiate communication of contiguous parcels - i4 ! ! !INTERFACE: subroutine mp_sendirr_i4 ( comm, send_bl, recv_bl, q1in, q1out, q2in, q2out, & 4,3 modc ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks integer(i4), intent(in) :: q1in(*) ! input array integer(i4), optional, intent(in) :: q2in(*) ! second input array integer, optional, intent(in) :: modc(4) ! 1: classical, swap p2p, swap a2a ! 2: handshake ! 3: send vs isend ! 4: max number of outstanding requests ! !OUTPUT PARAMETERS: integer(i4), intent(out) :: q1out(*) ! output array integer(i4), optional, intent(out) :: q2out(*) ! second output array ! ! !DESCRIPTION: ! Communicate a number of contiguous parcels to/from arbitrary set of PEs. ! Modc(1): if 0, use original approach of posting all communications here and placing ! wait points in mp_recvirr; if 1, call swap routine with p2p messages; if 2, call swap ! routine with a2a messages. ! Modc(2): if 1, then apply handshaking (don't send until corresponding receive is posted) ! Modc(3): if 1, then use blocking send; otherwise use nonblocking send ! Modc(4): maximum number of outstanding requests (applies to swap routines only) ! ! !REVISION HISTORY: ! 02.08.13 Sawyer Creation ! 02.11.06 Mirin Optimizations ! 03.03.03 Sawyer Use partneroffset ! 03.06.24 Sawyer Integrated Use_Mpi_Types; added qout ! 04.02.24 Mirin Various mpi2 options ! 08.09.18 Mirin Major overhaul, to include approaches from Mirin and Worley ! 09.10.07 Worley eliminated mpi_recv from handshake logic ! ! !BUGS: ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer ipe, qsize, offset, blocksize, nparcels, offset_s, offset_r, ierr, mod_method integer p, mysize, nthpc, minsize, nthrd, pn, pt, tmpsize, unitsize, offset_0 integer i, j, send_tag, recv_tag, num_s, num_r integer :: offset_v (Max_Nparcels) integer :: hs_rcvids(numpro) integer signal integer ipe2, ceil2num integer onetwo logical twovar integer sw_local, maxreq_local logical hs_local, send_local logical sw_alltoall integer comm_pid #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif if (present(modc)) then sw_local = modc(1) hs_local = (modc(2) .eq. 1) send_local = (modc(3) .eq. 1) maxreq_local = modc(4) else sw_local = 0 hs_local = .true. send_local = .false. maxreq_local = -1 endif ! Do not call mp_swapirr_i4 unless mod_method equals 0 mod_method = recv_bl(1)%method if (mod_method .gt. 0) sw_local = 0 onetwo = 1 twovar = .false. if (present(q2in)) then onetwo = 2 twovar = .true. endif if (sw_local .gt. 0) then sw_alltoall = (sw_local .eq. 2) if (present(q2in)) then call mp_swapirr_i4(comm, send_bl, recv_bl, q1in, q1out, q2in, q2out, & sw_handshake=hs_local, sw_maxreq=maxreq_local, & sw_alltoall=sw_alltoall, sw_send=send_local) else call mp_swapirr_i4(comm, send_bl, recv_bl, q1in, q1out, & sw_handshake=hs_local, sw_maxreq=maxreq_local, & sw_alltoall=sw_alltoall, sw_send=send_local) endif else call MPI_COMM_RANK (comm, comm_pid, ierr) signal = 1 ceil2num = ceil2(numpro) ! num_s = 0 if this processes is not part of the sending decomposition num_s = size(send_bl) if (send_bl(1)%Nparcels == -1) then num_s = 0 endif ! num_r = 0 if this processes is not part of the receiving decomposition num_r = size(recv_bl) if (recv_bl(1)%Nparcels == -1) then num_r = 0 endif mod_method = recv_bl(1)%method i4_win%ncall_s = i4_win%ncall_s + 1 if (mod_method .gt. 0) then ! ! mpi derived types if (i4_win%ncall_s .gt. MaxTrf-onetwo+1) then write(iulog,*) "mp_sendirr_i4: derived type handle count exceeded - exiting" write(iulog,*) "i4_win%ncall_s MaxTrf = ", i4_win%ncall_s, MaxTrf call exit(1) endif ! ! MPI: Irecv over all processes ! if (hs_local) then hs_rcvids(:) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then if (ipe-1 /= comm_pid) & call MPI_IRECV ( signal, 1, mp_i4, ipe-1, comm_pid, comm, hs_rcvids(ipe), ierr ) endif enddo endif OutHandle(:,i4_win%ncall_s) = MPI_REQUEST_NULL if (twovar) OutHandle(:,i4_win%ncall_s+1) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_r) cycle ! ! Receive the buffers with MPI_Irecv. Non-blocking ! if ( recv_bl(ipe)%type /= MPI_DATATYPE_NULL ) then recv_tag = ipe-1 + modcam_tagoffset call mpi_irecv( q1out, 1, recv_bl(ipe)%type, ipe-1, recv_tag, & comm, OutHandle(ipe,i4_win%ncall_s), ierr ) if (twovar) then call mpi_irecv( q2out, 1, recv_bl(ipe)%type, ipe-1, recv_tag, & comm, OutHandle(ipe,i4_win%ncall_s+1), ierr ) endif if (hs_local) then if (ipe-1 /= comm_pid) & call MPI_SEND ( signal, 1, mp_i4, ipe-1, ipe-1, comm, ierr ) endif endif enddo ! ! MPI: Isend/Send over all processes; use risend/rsend with hs ! InHandle(:,i4_win%ncall_s) = MPI_REQUEST_NULL if (twovar) InHandle(:,i4_win%ncall_s+1) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle ! ! Send the individual buffers with non-blocking sends ! if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then send_tag = comm_pid + modcam_tagoffset if (hs_local) then if (ipe-1 /= comm_pid) & call MPI_WAIT ( hs_rcvids(ipe), Status, ierr ) if (send_local) then call mpi_rsend( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, ierr ) else call mpi_irsend( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, InHandle(ipe,i4_win%ncall_s), ierr ) endif if (twovar) then if (send_local) then call mpi_rsend( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, ierr ) else call mpi_irsend( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, InHandle(ipe,i4_win%ncall_s+1), ierr ) endif endif else if (send_local) then call mpi_send( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, ierr ) else call mpi_isend( q1in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, InHandle(ipe,i4_win%ncall_s), ierr ) endif if (twovar) then if (send_local) then call mpi_send( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, ierr ) else call mpi_isend( q2in, 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, InHandle(ipe,i4_win%ncall_s+1), ierr ) endif endif endif endif enddo else ! temporary contiguous buffers if (i4_win%ncall_s .gt. max_irr-onetwo+1) then write(iulog,*) "mp_sendirr_i4: insufficient window storage - exiting" write(iulog,*) "i4_win%ncall_s max_irr = ", i4_win%ncall_s, max_irr call exit(1) endif unitsize = i4_win%size/max_irr ! issue call to receive data in global receive buffer offset_0 = (i4_win%ncall_s-1)*unitsize offset_s = offset_0 offset_r = offset_0 if (hs_local) then hs_rcvids(:) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle qsize = onetwo*send_bl(ipe)%Tot_Size if (qsize .ne. 0) then i4_win%dest = ipe-1 send_tag = comm_pid + modcam_tagoffset if (i4_win%dest /= comm_pid) & call MPI_IRECV ( signal, 1, mp_i4, i4_win%dest, send_tag, comm, hs_rcvids(ipe), ierr ) endif enddo endif do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_r) cycle i4_win%size_r = onetwo*recv_bl(ipe)%Tot_Size if (i4_win%size_r .ne. 0) then i4_win%offset_r = offset_r offset_r = offset_r + i4_win%size_r i4_win%src = ipe-1 if (onetwo*unitsize >= offset_r-offset_0) then recv_tag = i4_win%src + modcam_tagoffset qsize = i4_win%size_r i4_win%nrecv = i4_win%nrecv + 1 call MPI_IRECV(ga_i4_r(i4_win%offset_r+1), qsize, mp_i4, i4_win%src, & recv_tag, comm, i4_win%rqest(i4_win%nrecv), ierror) if (hs_local) then if (i4_win%src /= comm_pid) & call MPI_SEND ( signal, 1, mp_i4, i4_win%src, recv_tag, comm, ierror) endif else write(iulog,*) "Fatal mp_sendirr_i4: receive window out of space - exiting" write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif endif enddo ! gather data into global send buffer do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle qsize = onetwo*send_bl(ipe)%Tot_Size if (qsize .ne. 0) then i4_win%dest = ipe-1 i4_win%offset_s = offset_s offset_s = offset_s + qsize if (offset_s-offset_0 .gt. onetwo*unitsize) then write(iulog,*) "Fatal mp_sendirr_i4: send window out of space - exiting" write(iulog,*) 'comm_pid ipe unitsize offset_s offset_0 = ', comm_pid, & ipe, unitsize, offset_s, offset_0 call exit(1) endif offset_v(1) = i4_win%offset_s do j = 2, send_bl(ipe)%nparcels offset_v(j) = offset_v(j-1) + send_bl(ipe)%blocksizes(j-1) enddo do j = 1, send_bl(ipe)%nparcels do i = 1, send_bl(ipe)%blocksizes(j) ga_i4_s(offset_v(j)+i) = q1in(send_bl(ipe)%displacements(j)+i) enddo enddo if (twovar) then do j = 1, send_bl(ipe)%nparcels do i = 1, send_bl(ipe)%blocksizes(j) ga_i4_s(send_bl(ipe)%Tot_Size+offset_v(j)+i) = q2in(send_bl(ipe)%displacements(j)+i) enddo enddo endif ! nonblocking send send_tag = comm_pid + modcam_tagoffset i4_win%nsend = i4_win%nsend + 1 if (hs_local) then if (i4_win%dest /= comm_pid) & call MPI_WAIT ( hs_rcvids(ipe), Status, ierr ) if (send_local) then call MPI_RSEND(ga_i4_s(i4_win%offset_s+1), qsize, mp_i4, i4_win%dest, & send_tag, comm, ierr) else call MPI_IRSEND(ga_i4_s(i4_win%offset_s+1), qsize, mp_i4, i4_win%dest, & send_tag, comm, i4_win%sqest(i4_win%nsend), ierr) endif else if (send_local) then call MPI_SEND(ga_i4_s(i4_win%offset_s+1), qsize, mp_i4, i4_win%dest, & send_tag, comm, ierr) else call MPI_ISEND(ga_i4_s(i4_win%offset_s+1), qsize, mp_i4, i4_win%dest, & send_tag, comm, i4_win%sqest(i4_win%nsend), ierr) endif endif endif enddo endif ! mod_method if (twovar) i4_win%ncall_s = i4_win%ncall_s + 1 endif ! sw_local #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif end subroutine mp_sendirr_i4 !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_recvirr_i4 --- Finalize communication of contiguous parcels - i4 ! ! !INTERFACE: subroutine mp_recvirr_i4 ( comm, send_bl, recv_bl, q1in, q1out, q2in, q2out, & 4,1 modc ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks integer(i4), intent(in) :: q1in(*) ! input array integer(i4), optional, intent(in) :: q2in(*) ! second input array integer, optional, intent(in) :: modc(4) ! 1: classical, swap p2p, swap a2a ! 2: handshake ! 3: send vs isend ! 4: max number of outstanding requests ! !INPUT/OUTPUT PARAMETERS: integer(i4), intent(inout) :: q1out(*) ! output array integer(i4), optional, intent(inout) :: q2out(*) ! second output array ! ! !DESCRIPTION: ! Complete transfer of a generalized region initiated by {\tt mp\_sendirr}. ! Communicate a number of contiguous parcels to/from arbitrary set of PEs. ! Modc(1): if 0, use original approach of posting all communications in mp_sendirr and ! placing wait points here; otherwise don't do anything - mp_swapirr is called from mp_sendirr. ! Modc(3): if 1, then use blocking send; otherwise use nonblocking send ! ! !REVISION HISTORY: ! 02.08.15 Sawyer Creation ! 02.11.06 Mirin Optimizations ! 03.03.03 Sawyer Now using packed arrays for MPI2 ! 04.02.24 Mirin Various mpi2 options ! 08.09.18 Mirin Major overhaul, to include approaches from Mirin and Worley ! !EOP !------------------------------------------------------------------------------ !BOC integer :: ipe, blocksize, offset_r, mod_method integer unitsize, offset_0 integer Ierr integer InStats(numpro*MPI_STATUS_SIZE) integer OutStats(numpro*MPI_STATUS_SIZE) integer i, j, num_r, num_s integer :: offset_v (Max_Nparcels) integer ipe2, ceil2num integer onetwo logical twovar integer sw_local, maxreq_local logical hs_local, send_local logical sw_alltoall integer comm_size, comm_pid if (present(modc)) then sw_local = modc(1) hs_local = (modc(2) .eq. 1) send_local = (modc(3) .eq. 1) maxreq_local = modc(4) else sw_local = 0 hs_local = .true. send_local = .false. maxreq_local = -1 endif ! Do not call mp_swapirr_i4 (hence return) unless mod_method equals 0 mod_method = recv_bl(1)%method if (mod_method .gt. 0) sw_local = 0 ! Return if swap_irr if (sw_local .gt. 0) return #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif onetwo = 1 twovar = .false. if (present(q2in)) then onetwo = 2 twovar = .true. endif call MPI_COMM_SIZE (comm, comm_size, ierr) call MPI_COMM_RANK (comm, comm_pid, ierr) ceil2num = ceil2(numpro) ! num_s = 0 if this processes is not part of the sending decomposition num_s = size(send_bl) if (send_bl(1)%Nparcels == -1) then num_s = 0 endif ! num_r = 0 if this processes is not part of the receiving decomposition num_r = size(recv_bl) if (recv_bl(1)%Nparcels == -1) then num_r = 0 endif mod_method = recv_bl(1)%method i4_win%ncall_r = i4_win%ncall_r + 1 if (mod_method .gt. 0) then ! mpi derived types if (i4_win%ncall_r .gt. MaxTrf-onetwo+1) then write(iulog,*) "mp_recvirr_i4: derived type handle count exceeded - exiting" write(iulog,*) "i4_win%ncall_r MaxTrf = ", i4_win%ncall_r, MaxTrf call exit(1) endif if (num_s .gt. 0 .and. (.not. send_local)) then CALL MPI_WAITALL( comm_size, InHandle(:,i4_win%ncall_r), InStats, Ierr ) if (twovar) then CALL MPI_WAITALL( comm_size, InHandle(:,i4_win%ncall_r+1), InStats, Ierr ) endif endif if (num_r .gt. 0) then CALL MPI_WAITALL( comm_size, OutHandle(:,i4_win%ncall_r), OutStats, Ierr ) if (twovar) then CALL MPI_WAITALL( comm_size, OutHandle(:,i4_win%ncall_r+1), OutStats, Ierr ) endif endif else ! temporary contiguous buffer / global window if (i4_win%ncall_r .gt. max_irr-onetwo+1) then write(iulog,*) "mp_recvirr_i4: insufficient window storage - exiting" write(iulog,*) "i4_win%ncall_r max_irr = ", i4_win%ncall_r, max_irr call exit(1) endif unitsize = i4_win%size/max_irr ! scatter data from global receive buffer to final destination offset_0 = (i4_win%ncall_r-1)*unitsize offset_r = offset_0 do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_r) cycle i4_win%size_r = onetwo*recv_bl(ipe)%Tot_Size if (i4_win%size_r .ne. 0) then i4_win%offset_r = offset_r offset_r = offset_r + i4_win%size_r if (offset_r-offset_0 .gt. onetwo*unitsize) then write(iulog,*) "Fatal mp_recvirr_i4: receive window out of space - exiting" write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif i4_win%nread = i4_win%nread + 1 call MPI_WAIT(i4_win%rqest(i4_win%nread), Status, ierr) offset_v(1) = i4_win%offset_r do j = 2, recv_bl(ipe)%Nparcels offset_v(j) = offset_v(j-1) + recv_bl(ipe)%blocksizes(j-1) enddo do j = 1, recv_bl(ipe)%Nparcels do i = 1, recv_bl(ipe)%blocksizes(j) q1out(recv_bl(ipe)%displacements(j)+i) = ga_i4_r(offset_v(j)+i) enddo enddo if (twovar) then do j = 1, recv_bl(ipe)%Nparcels do i = 1, recv_bl(ipe)%blocksizes(j) q2out(recv_bl(ipe)%displacements(j)+i) = ga_i4_r(recv_bl(ipe)%Tot_Size+offset_v(j)+i) enddo enddo endif endif enddo if ((i4_win%ncall_s == i4_win%ncall_r + onetwo - 1) .and. (.not. send_local)) then call MPI_WAITALL(i4_win%nsend, i4_win%sqest, Stats, ierror) endif endif ! mod_method .gt. 0 if (twovar) i4_win%ncall_r = i4_win%ncall_r + 1 if (i4_win%ncall_s == i4_win%ncall_r) then i4_win%nsend = 0 i4_win%nrecv = 0 i4_win%nread = 0 i4_win%ncall_s = 0 i4_win%ncall_r = 0 endif #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_recvirr_i4 !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_swapirr --- Write r8 contiguous parcels to global array ! using XOR swap ordering ! ! !INTERFACE: subroutine mp_swapirr ( comm, send_bl, recv_bl, a1in, a1out, & 2,2 a2in, a2out, sw_handshake, sw_maxreq, & sw_alltoall, sw_send ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks real(r8), intent(in) :: a1in(*) ! local data segment real(r8), optional, intent(in) :: a2in(*) ! local data segment logical, optional, intent(in) :: sw_handshake ! use flow control and ! ready send integer, optional, intent(in) :: sw_maxreq ! maximum number of outstanding ! MPI requests logical, optional, intent(in) :: sw_alltoall ! use mpi_alltoall logical, optional, intent(in) :: sw_send ! use mpi_send instead of isend ! !OUTPUT PARAMETERS: real(r8), intent(out) :: a1out(*) ! local output segment real(r8), optional, intent(out) :: a2out(*) ! local output segment ! ! !DESCRIPTION: ! XOR-ordered version of all-to-all communication ! ! WARNING: mod_comm parameter max_irr might need to be set larger than expected ! when swapping two variables; specifically, max_irr must be at least ! as large as the incoming r8_win%ncall_s + the number of variables to ! be swapped ! ! !REVISION HISTORY: ! 08.06.30 Worley original: derived from mp_sendirr, but using ! swapm logic and XOR swap order ! 08.08.22 Worley removed swapm; reimplemented with native MPI, ! added flow control/ready send option and maxreq ! throttling, added alltoall option ! 09.10.07 Worley eliminated mpi_recv from handshake logic ! ! !BUGS: ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer :: i, j, p, istep, num_s, num_r integer :: comm_pid, comm_size, steps, ierr integer :: ipe, offset_s, offset_r, offset_0, unitsize, onetwo integer :: arr_sndlths(0:numpro-1), arr_rcvlths(0:numpro-1) integer :: sndlths(0:numpro-1), sdispls(0:numpro-1) integer :: rcvlths(0:numpro-1), rdispls(0:numpro-1) integer :: swapids(numpro) integer :: sndids(numpro) ! nonblocking MPI send request ids integer :: rcvids(numpro) ! nonblocking MPI recv request ids integer :: hs_rcvids(numpro) ! nonblocking MPI handshake recv request ids integer :: InStats(numpro*MPI_STATUS_SIZE) integer :: OutStats(numpro*MPI_STATUS_SIZE) integer :: offset_v integer :: rstep integer :: maxreq, maxreqh, hs logical :: handshake, alltoall, sendd #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_SIZE (comm, comm_size, ierr) call MPI_COMM_RANK (comm, comm_pid, ierr) ! num_s = 0 if this process is not part of the sending decomposition num_s = size(send_bl) if (send_bl(1)%Nparcels == -1) then num_s = 0 endif ! num_r = 0 if this process is not part of the receiving decomposition num_r = size(recv_bl) if (recv_bl(1)%Nparcels == -1) then num_r = 0 endif if ( present(a2in) .and. (.not. present(a2out)) ) then write(iulog,*) "Fatal mp_swapirr: a2in specified, but a2out missing - exiting" call exit(1) endif if ( (.not. present(a2in)) .and. present(a2out)) then write(iulog,*) "Fatal mp_swapirr: a2out specified, but a2in missing - exiting" call exit(1) endif if ( present(sw_handshake) ) then handshake = sw_handshake else handshake = .false. endif if ( present(sw_alltoall) ) then alltoall = sw_alltoall else alltoall = .false. endif if ( present(sw_send) ) then sendd = sw_send else sendd = .false. endif onetwo = 1 if (present(a2in)) onetwo = 2 unitsize = r8_win%size/max_irr ! advance to unused portion of storage window r8_win%ncall_s = r8_win%ncall_s + 1 if (r8_win%ncall_s .gt. max_irr-onetwo+1) then write(iulog,*) "mp_swapirr: insufficient window storage - exiting" write(iulog,*) "r8_win%ncall_s max_irr = ", r8_win%ncall_s, max_irr call exit(1) endif ! calculate send lengths and displacements offset_0 = (r8_win%ncall_s-1)*unitsize offset_s = offset_0 sndlths(:) = 0 sdispls(:) = 0 arr_sndlths(:) = 0 do ipe=1, num_s sndlths(ipe-1) = send_bl(ipe)%Tot_Size sdispls(ipe-1) = offset_s if (sndlths(ipe-1) .ne. 0) then ! pack first array offset_s = offset_s + sndlths(ipe-1) if (offset_s-offset_0 .gt. onetwo*unitsize) then write(iulog,*) "Fatal mp_swapirr: send window out of space - exiting" write(iulog,*) '1 comm_pid ipe unitsize offset_s offset_0 = ', comm_pid, & ipe, unitsize, offset_s, offset_0 call exit(1) endif arr_sndlths(ipe-1) = sndlths(ipe-1) ! calculate for second array (if it exists) if ( present(a2in) ) then offset_s = offset_s + sndlths(ipe-1) if (offset_s-offset_0 .gt. onetwo*unitsize) then write(iulog,*) "Fatal mp_swapirr: send window out of space - exiting" write(iulog,*) '2 comm_pid ipe unitsize offset_s offset_0 = ', comm_pid, & ipe, unitsize, offset_s, offset_0 call exit(1) endif sndlths(ipe-1) = sndlths(ipe-1) + arr_sndlths(ipe-1) endif endif enddo ! calculate receive lengths and displacements offset_r = offset_0 rcvlths(:) = 0 rdispls(:) = 0 arr_rcvlths(:) = 0 do ipe=1, num_r rcvlths(ipe-1) = recv_bl(ipe)%Tot_Size rdispls(ipe-1) = offset_r if (rcvlths(ipe-1) .ne. 0) then offset_r = offset_r + rcvlths(ipe-1) if (onetwo*unitsize < offset_r-offset_0) then write(iulog,*) "Fatal mp_swapirr: receive window out of space - exiting" write(iulog,*) '1 comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif arr_rcvlths(ipe-1) = rcvlths(ipe-1) ! compute for second array (if it exists) if ( present(a2out) ) then offset_r = offset_r + rcvlths(ipe-1) if (onetwo*unitsize < offset_r-offset_0) then write(iulog,*) "Fatal mp_swapirr: receive window out of space - exiting" write(iulog,*) '2 comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif rcvlths(ipe-1) = rcvlths(ipe-1) + arr_rcvlths(ipe-1) endif endif enddo ! Calculate swap partners and number of steps in point-to-point ! implementations of alltoall algorithm. steps = 0 do ipe=1,ceil2(comm_size)-1 p = pair(comm_size,ipe,comm_pid) if (p >= 0) then if (sndlths(p) > 0 .or. rcvlths(p) > 0) then steps = steps + 1 swapids(steps) = p end if end if end do if (.not. alltoall) then sndids(1:steps) = MPI_REQUEST_NULL rcvids(1:steps) = MPI_REQUEST_NULL if (steps .eq. 0) then maxreq = 0 maxreqh = 0 elseif (steps .eq. 1) then maxreq = 1 maxreqh = 1 else if ( present(sw_maxreq) ) then if ((sw_maxreq .le. steps) .and. (sw_maxreq .ge. 0)) then maxreq = sw_maxreq if (maxreq > 1) then maxreqh = maxreq/2 else maxreq = 2 maxreqh = 1 endif else maxreq = steps maxreqh = steps endif else maxreq = steps maxreqh = steps endif endif ! Post initial handshake receive requests if (handshake) then do istep=1,maxreq p = swapids(istep) if (sndlths(p) > 0) then call mpi_irecv ( hs, 1, mp_i4, p, comm_pid, comm, & hs_rcvids(istep), ierr ) endif enddo endif ! Post initial receive requests do istep=1,maxreq p = swapids(istep) if (rcvlths(p) > 0) then offset_r = rdispls(p)+1 call mpi_irecv ( ga_r8_r(offset_r), rcvlths(p), mp_r8, & p, p, comm, rcvids(istep), ierr ) if (handshake) then call mpi_send( hs, 1, mp_i4, p, p, comm, & ierr ) endif endif enddo rstep = maxreq ! endif ! gather data into global send buffer do istep=1,steps p = swapids(istep) if (sndlths(p) .ne. 0) then offset_v = sdispls(p) do j = 1, send_bl(p+1)%nparcels do i = 1, send_bl(p+1)%blocksizes(j) ga_r8_s(offset_v+i) = a1in(send_bl(p+1)%displacements(j)+i) enddo offset_v = offset_v + send_bl(p+1)%blocksizes(j) enddo ! pack second array (if it exists) if ( present(a2in) ) then offset_v = sdispls(p) + arr_sndlths(p) do j = 1, send_bl(p+1)%nparcels do i = 1, send_bl(p+1)%blocksizes(j) ga_r8_s(offset_v+i) = a2in(send_bl(p+1)%displacements(j)+i) enddo offset_v = offset_v + send_bl(p+1)%blocksizes(j) enddo endif endif if (.not. alltoall) then ! Submit new i(r)send request offset_s = sdispls(p)+1 if (sndlths(p) > 0) then if (handshake) then call mpi_wait( hs_rcvids(istep), Status, ierr ) if (sendd) then call mpi_rsend( ga_r8_s(offset_s), sndlths(p), mp_r8, & p, comm_pid, comm, ierr ) else call mpi_irsend( ga_r8_s(offset_s), sndlths(p), mp_r8, & p, comm_pid, comm, sndids(istep), ierr ) endif else if (sendd) then call mpi_send ( ga_r8_s(offset_s), sndlths(p), mp_r8, & p, comm_pid, comm, ierr ) else call mpi_isend ( ga_r8_s(offset_s), sndlths(p), mp_r8, & p, comm_pid, comm, sndids(istep), ierr ) endif endif endif if (istep > maxreqh) then ! Wait for oldest irecv request to complete call mpi_wait( rcvids(istep-maxreqh), OutStats, ierr ) if (rstep < steps) then rstep = rstep + 1 p = swapids(rstep) ! Submit a new handshake irecv request if (handshake) then if (sndlths(p) > 0) then call mpi_irecv( hs, 1, mp_i4, p, comm_pid, comm, & hs_rcvids(rstep), ierr ) endif endif ! Submit a new irecv request if (rcvlths(p) > 0) then offset_r = rdispls(p)+1 call mpi_irecv( ga_r8_r(offset_r), rcvlths(p), mp_r8, & p, p, comm, rcvids(rstep), ierr ) if (handshake) then call mpi_send ( hs, 1, mp_i4, p, p, comm, & ierr ) endif endif endif ! Wait for outstanding i(r)send request to complete if (.not. sendd) then call mpi_wait( sndids(istep-maxreqh), InStats, ierr ) endif endif ! endif ! enddo ! local copy to send buffer if (sndlths(comm_pid) .ne. 0) then offset_v = sdispls(comm_pid) do j = 1, send_bl(comm_pid+1)%nparcels do i = 1, send_bl(comm_pid+1)%blocksizes(j) ga_r8_s(offset_v+i) = a1in(send_bl(comm_pid+1)%displacements(j)+i) enddo offset_v = offset_v + send_bl(comm_pid+1)%blocksizes(j) enddo ! pack second array (if it exists) if ( present(a2in) ) then offset_v = sdispls(comm_pid) + arr_sndlths(comm_pid) do j = 1, send_bl(comm_pid+1)%nparcels do i = 1, send_bl(comm_pid+1)%blocksizes(j) ga_r8_s(offset_v+i) = a2in(send_bl(comm_pid+1)%displacements(j)+i) enddo offset_v = offset_v + send_bl(comm_pid+1)%blocksizes(j) enddo endif if (.not. alltoall) then ga_r8_r(rdispls(comm_pid)+1:rdispls(comm_pid)+rcvlths(comm_pid)) = & ga_r8_s(sdispls(comm_pid)+1:sdispls(comm_pid)+sndlths(comm_pid)) endif endif if (alltoall) then call mpi_alltoallv (ga_r8_s, sndlths, sdispls, mp_r8, & ga_r8_r, rcvlths, rdispls, mp_r8, & comm, ierror) endif ! local copy from receive buffer if (rcvlths(comm_pid) .ne. 0) then offset_v = rdispls(comm_pid) do j = 1, recv_bl(comm_pid+1)%Nparcels do i = 1, recv_bl(comm_pid+1)%blocksizes(j) a1out(recv_bl(comm_pid+1)%displacements(j)+i) = ga_r8_r(offset_v+i) enddo offset_v = offset_v + recv_bl(comm_pid+1)%blocksizes(j) enddo ! scatter data for second array (if it exists) if ( present(a2out) ) then offset_v = rdispls(comm_pid) + arr_rcvlths(comm_pid) do j = 1, recv_bl(comm_pid+1)%Nparcels do i = 1, recv_bl(comm_pid+1)%blocksizes(j) a2out(recv_bl(comm_pid+1)%displacements(j)+i) = ga_r8_r(offset_v+i) enddo offset_v = offset_v + recv_bl(comm_pid+1)%blocksizes(j) enddo endif endif ! scatter data from global receive buffer to final destination do istep=1,steps p = swapids(istep) if (.not. alltoall) then if (istep > steps-maxreqh) then call mpi_wait( rcvids(istep), OutStats, ierr ) endif endif if (rcvlths(p) .ne. 0) then offset_v = rdispls(p) do j = 1, recv_bl(p+1)%Nparcels do i = 1, recv_bl(p+1)%blocksizes(j) a1out(recv_bl(p+1)%displacements(j)+i) = ga_r8_r(offset_v+i) enddo offset_v = offset_v + recv_bl(p+1)%blocksizes(j) enddo ! scatter data for second array (if it exists) if ( present(a2out) ) then offset_v = rdispls(p) + arr_rcvlths(p) do j = 1, recv_bl(p+1)%Nparcels do i = 1, recv_bl(p+1)%blocksizes(j) a2out(recv_bl(p+1)%displacements(j)+i) = ga_r8_r(offset_v+i) enddo offset_v = offset_v + recv_bl(p+1)%blocksizes(j) enddo endif endif enddo ! Wait for any outstanding send requests to complete. if (.not. alltoall .and. .not. sendd) then call mpi_waitall( maxreqh, sndids(steps-maxreqh+1), InStats, ierr ) endif ! clean-up ! make used portion of storage window available for reuse r8_win%ncall_s = r8_win%ncall_s - 1 #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_swapirr !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_swapirr_i4 --- Write i4 contiguous parcels to global array ! using XOR swap ordering ! ! !INTERFACE: subroutine mp_swapirr_i4 ( comm, send_bl, recv_bl, a1in, a1out, & 2,2 a2in, a2out, sw_handshake, sw_maxreq, & sw_alltoall, sw_send ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks integer(i4), intent(in) :: a1in(*) ! input array integer(i4), optional, intent(in) :: a2in(*) ! second input array logical, optional, intent(in) :: sw_handshake ! use flow control and ! ready send integer, optional, intent(in) :: sw_maxreq ! maximum number of outstanding ! MPI requests logical, optional, intent(in) :: sw_alltoall ! use mpi_alltoall logical, optional, intent(in) :: sw_send ! use mpi_send instead of isend ! !OUTPUT PARAMETERS: integer(i4), intent(out) :: a1out(*) ! output array integer(i4), optional, intent(out) :: a2out(*) ! second output array ! ! !DESCRIPTION: ! XOR-ordered version of all-to-all communication ! ! WARNING: mod_comm parameter max_irr might need to be set larger than expected ! when swapping two variables; specifically, max_irr must be at least ! as large as the incoming i4_win%ncall_s + the number of variables to ! be swapped ! ! !REVISION HISTORY: ! 08.06.30 Worley original: derived from mp_sendirr, but using ! swapm logic and XOR swap order ! 08.08.22 Worley removed swapm; reimplemented with native MPI, ! added flow control/ready send option and maxreq ! throttling, added alltoall option ! 09.10.07 Worley eliminated mpi_recv from handshake logic ! ! !BUGS: ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer :: i, j, p, istep, num_s, num_r integer :: comm_pid, comm_size, steps, ierr integer :: ipe, offset_s, offset_r, offset_0, unitsize, onetwo integer :: arr_sndlths(0:numpro-1), arr_rcvlths(0:numpro-1) integer :: sndlths(0:numpro-1), sdispls(0:numpro-1) integer :: rcvlths(0:numpro-1), rdispls(0:numpro-1) integer :: swapids(numpro) integer :: sndids(numpro) ! nonblocking MPI send request ids integer :: rcvids(numpro) ! nonblocking MPI recv request ids integer :: hs_rcvids(numpro) ! nonblocking MPI handshake recv request ids integer :: InStats(numpro*MPI_STATUS_SIZE) integer :: OutStats(numpro*MPI_STATUS_SIZE) integer :: offset_v integer :: rstep integer :: maxreq, maxreqh, hs logical :: handshake, alltoall, sendd #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_SIZE (comm, comm_size, ierr) call MPI_COMM_RANK (comm, comm_pid, ierr) ! num_s = 0 if this process is not part of the sending decomposition num_s = size(send_bl) if (send_bl(1)%Nparcels == -1) then num_s = 0 endif ! num_r = 0 if this process is not part of the receiving decomposition num_r = size(recv_bl) if (recv_bl(1)%Nparcels == -1) then num_r = 0 endif if ( present(a2in) .and. (.not. present(a2out)) ) then write(iulog,*) "Fatal mp_swapirr_i4: a2in specified, but a2out missing - exiting" call exit(1) endif if ( (.not. present(a2in)) .and. present(a2out)) then write(iulog,*) "Fatal mp_swapirr_i4: a2out specified, but a2in missing - exiting" call exit(1) endif if ( present(sw_handshake) ) then handshake = sw_handshake else handshake = .false. endif if ( present(sw_alltoall) ) then alltoall = sw_alltoall else alltoall = .false. endif if ( present(sw_send) ) then sendd = sw_send else sendd = .false. endif onetwo = 1 if (present(a2in)) onetwo = 2 unitsize = i4_win%size/max_irr ! advance to unused portion of storage window i4_win%ncall_s = i4_win%ncall_s + 1 if (i4_win%ncall_s .gt. max_irr-onetwo+1) then write(iulog,*) "mp_swapirr_i4: insufficient window storage - exiting" write(iulog,*) "i4_win%ncall_s max_irr = ", i4_win%ncall_s, max_irr call exit(1) endif ! calculate send lengths and displacements offset_0 = (i4_win%ncall_s-1)*unitsize offset_s = offset_0 sndlths(:) = 0 sdispls(:) = 0 arr_sndlths(:) = 0 do ipe=1, num_s sndlths(ipe-1) = send_bl(ipe)%Tot_Size sdispls(ipe-1) = offset_s if (sndlths(ipe-1) .ne. 0) then ! pack first array offset_s = offset_s + sndlths(ipe-1) if (offset_s-offset_0 .gt. onetwo*unitsize) then write(iulog,*) "Fatal mp_swapirr_i4: send window out of space - exiting" write(iulog,*) '1 comm_pid ipe unitsize offset_s offset_0 = ', comm_pid, & ipe, unitsize, offset_s, offset_0 call exit(1) endif arr_sndlths(ipe-1) = sndlths(ipe-1) ! calculate for second array (if it exists) if ( present(a2in) ) then offset_s = offset_s + sndlths(ipe-1) if (offset_s-offset_0 .gt. onetwo*unitsize) then write(iulog,*) "Fatal mp_swapirr_i4: send window out of space - exiting" write(iulog,*) '2 comm_pid ipe unitsize offset_s offset_0 = ', comm_pid, & ipe, unitsize, offset_s, offset_0 call exit(1) endif sndlths(ipe-1) = sndlths(ipe-1) + arr_sndlths(ipe-1) endif endif enddo ! calculate receive lengths and displacements offset_r = offset_0 rcvlths(:) = 0 rdispls(:) = 0 arr_rcvlths(:) = 0 do ipe=1, num_r rcvlths(ipe-1) = recv_bl(ipe)%Tot_Size rdispls(ipe-1) = offset_r if (rcvlths(ipe-1) .ne. 0) then offset_r = offset_r + rcvlths(ipe-1) if (onetwo*unitsize < offset_r-offset_0) then write(iulog,*) "Fatal mp_swapirr_i4: receive window out of space - exiting" write(iulog,*) '1 comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif arr_rcvlths(ipe-1) = rcvlths(ipe-1) ! compute for second array (if it exists) if ( present(a2out) ) then offset_r = offset_r + rcvlths(ipe-1) if (onetwo*unitsize < offset_r-offset_0) then write(iulog,*) "Fatal mp_swapirr_i4: receive window out of space - exiting" write(iulog,*) '2 comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif rcvlths(ipe-1) = rcvlths(ipe-1) + arr_rcvlths(ipe-1) endif endif enddo ! Calculate swap partners and number of steps in point-to-point ! implementations of alltoall algorithm. steps = 0 do ipe=1,ceil2(comm_size)-1 p = pair(comm_size,ipe,comm_pid) if (p >= 0) then if (sndlths(p) > 0 .or. rcvlths(p) > 0) then steps = steps + 1 swapids(steps) = p end if end if end do if (.not. alltoall) then sndids(1:steps) = MPI_REQUEST_NULL rcvids(1:steps) = MPI_REQUEST_NULL if (steps .eq. 0) then maxreq = 0 maxreqh = 0 elseif (steps .eq. 1) then maxreq = 1 maxreqh = 1 else if ( present(sw_maxreq) ) then if ((sw_maxreq .le. steps) .and. (sw_maxreq .ge. 0)) then maxreq = sw_maxreq if (maxreq > 1) then maxreqh = maxreq/2 else maxreq = 2 maxreqh = 1 endif else maxreq = steps maxreqh = steps endif else maxreq = steps maxreqh = steps endif endif ! Post initial handshake receive requests if (handshake) then do istep=1,maxreq p = swapids(istep) if (sndlths(p) > 0) then call mpi_irecv ( hs, 1, mp_i4, p, comm_pid, comm, & hs_rcvids(istep), ierr ) endif enddo endif ! Post initial receive requests do istep=1,maxreq p = swapids(istep) if (rcvlths(p) > 0) then offset_r = rdispls(p)+1 call mpi_irecv ( ga_i4_r(offset_r), rcvlths(p), mp_i4, & p, p, comm, rcvids(istep), ierr ) if (handshake) then call mpi_send( hs, 1, mp_i4, p, p, comm, & ierr ) endif endif enddo rstep = maxreq ! endif ! gather data into global send buffer do istep=1,steps p = swapids(istep) if (sndlths(p) .ne. 0) then offset_v = sdispls(p) do j = 1, send_bl(p+1)%nparcels do i = 1, send_bl(p+1)%blocksizes(j) ga_i4_s(offset_v+i) = a1in(send_bl(p+1)%displacements(j)+i) enddo offset_v = offset_v + send_bl(p+1)%blocksizes(j) enddo ! pack second array (if it exists) if ( present(a2in) ) then offset_v = sdispls(p) + arr_sndlths(p) do j = 1, send_bl(p+1)%nparcels do i = 1, send_bl(p+1)%blocksizes(j) ga_i4_s(offset_v+i) = a2in(send_bl(p+1)%displacements(j)+i) enddo offset_v = offset_v + send_bl(p+1)%blocksizes(j) enddo endif endif if (.not. alltoall) then ! Submit new i(r)send request offset_s = sdispls(p)+1 if (sndlths(p) > 0) then if (handshake) then call mpi_wait( hs_rcvids(istep), Status, ierr ) if (sendd) then call mpi_rsend( ga_i4_s(offset_s), sndlths(p), mp_i4, & p, comm_pid, comm, ierr ) else call mpi_irsend( ga_i4_s(offset_s), sndlths(p), mp_i4, & p, comm_pid, comm, sndids(istep), ierr ) endif else if (sendd) then call mpi_send ( ga_i4_s(offset_s), sndlths(p), mp_i4, & p, comm_pid, comm, ierr ) else call mpi_isend ( ga_i4_s(offset_s), sndlths(p), mp_i4, & p, comm_pid, comm, sndids(istep), ierr ) endif endif endif if (istep > maxreqh) then ! Wait for oldest irecv request to complete call mpi_wait( rcvids(istep-maxreqh), OutStats, ierr ) if (rstep < steps) then rstep = rstep + 1 p = swapids(rstep) ! Submit a new handshake irecv request if (handshake) then if (sndlths(p) > 0) then call mpi_irecv( hs, 1, mp_i4, p, comm_pid, comm, & hs_rcvids(rstep), ierr ) endif endif ! Submit a new irecv request if (rcvlths(p) > 0) then offset_r = rdispls(p)+1 call mpi_irecv( ga_i4_r(offset_r), rcvlths(p), mp_i4, & p, p, comm, rcvids(rstep), ierr ) if (handshake) then call mpi_send ( hs, 1, mp_i4, p, p, comm, & ierr ) endif endif endif ! Wait for outstanding i(r)send request to complete if (.not. sendd) then call mpi_wait( sndids(istep-maxreqh), InStats, ierr ) endif endif ! endif ! enddo ! local copy to send buffer if (sndlths(comm_pid) .ne. 0) then offset_v = sdispls(comm_pid) do j = 1, send_bl(comm_pid+1)%nparcels do i = 1, send_bl(comm_pid+1)%blocksizes(j) ga_i4_s(offset_v+i) = a1in(send_bl(comm_pid+1)%displacements(j)+i) enddo offset_v = offset_v + send_bl(comm_pid+1)%blocksizes(j) enddo ! pack second array (if it exists) if ( present(a2in) ) then offset_v = sdispls(comm_pid) + arr_sndlths(comm_pid) do j = 1, send_bl(comm_pid+1)%nparcels do i = 1, send_bl(comm_pid+1)%blocksizes(j) ga_i4_s(offset_v+i) = a2in(send_bl(comm_pid+1)%displacements(j)+i) enddo offset_v = offset_v + send_bl(comm_pid+1)%blocksizes(j) enddo endif if (.not. alltoall) then ga_i4_r(rdispls(comm_pid)+1:rdispls(comm_pid)+rcvlths(comm_pid)) = & ga_i4_s(sdispls(comm_pid)+1:sdispls(comm_pid)+sndlths(comm_pid)) endif endif if (alltoall) then call mpi_alltoallv (ga_i4_s, sndlths, sdispls, mp_i4, & ga_i4_r, rcvlths, rdispls, mp_i4, & comm, ierror) endif ! local copy from receive buffer if (rcvlths(comm_pid) .ne. 0) then offset_v = rdispls(comm_pid) do j = 1, recv_bl(comm_pid+1)%Nparcels do i = 1, recv_bl(comm_pid+1)%blocksizes(j) a1out(recv_bl(comm_pid+1)%displacements(j)+i) = ga_i4_r(offset_v+i) enddo offset_v = offset_v + recv_bl(comm_pid+1)%blocksizes(j) enddo ! scatter data for second array (if it exists) if ( present(a2out) ) then offset_v = rdispls(comm_pid) + arr_rcvlths(comm_pid) do j = 1, recv_bl(comm_pid+1)%Nparcels do i = 1, recv_bl(comm_pid+1)%blocksizes(j) a2out(recv_bl(comm_pid+1)%displacements(j)+i) = ga_i4_r(offset_v+i) enddo offset_v = offset_v + recv_bl(comm_pid+1)%blocksizes(j) enddo endif endif ! scatter data from global receive buffer to final destination do istep=1,steps p = swapids(istep) if (.not. alltoall) then if (istep > steps-maxreqh) then call mpi_wait( rcvids(istep), OutStats, ierr ) endif endif if (rcvlths(p) .ne. 0) then offset_v = rdispls(p) do j = 1, recv_bl(p+1)%Nparcels do i = 1, recv_bl(p+1)%blocksizes(j) a1out(recv_bl(p+1)%displacements(j)+i) = ga_i4_r(offset_v+i) enddo offset_v = offset_v + recv_bl(p+1)%blocksizes(j) enddo ! scatter data for second array (if it exists) if ( present(a2out) ) then offset_v = rdispls(p) + arr_rcvlths(p) do j = 1, recv_bl(p+1)%Nparcels do i = 1, recv_bl(p+1)%blocksizes(j) a2out(recv_bl(p+1)%displacements(j)+i) = ga_i4_r(offset_v+i) enddo offset_v = offset_v + recv_bl(p+1)%blocksizes(j) enddo endif endif enddo ! Wait for any outstanding send requests to complete. if (.not. alltoall .and. .not. sendd) then call mpi_waitall( maxreqh, sndids(steps-maxreqh+1), InStats, ierr ) endif ! clean-up ! make used portion of storage window available for reuse i4_win%ncall_s = i4_win%ncall_s - 1 #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_swapirr_i4 !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: pair ! ! !INTERFACE: integer function pair(np,p,k) 6 ! ! !INPUT PARAMETERS: integer :: np integer :: p integer :: k ! !DESCRIPTION: ! ! Bitwise XOR of arguments p and k, if less than upper bound np ! ! !REVISION HISTORY: ! 2008.08.21 Worley Imported from spmdutils ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer q ! q = ieor(p,k) if ( q > np-1 ) then pair = -1 else pair = q endif return !EOC end function pair !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: ceil2 ! ! !INTERFACE: integer function ceil2(n) 14 ! ! !INPUT PARAMETERS: integer :: n ! !DESCRIPTION: ! ! Smallest power of 2 greater than or equal to the argument ! ! !REVISION HISTORY: ! 2008.08.21 Worley Imported from spmdutils ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer p p=1 do while ( p < n ) p=p*2 enddo ceil2=p return !EOC end function ceil2 !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ # if defined( MOD_ASSUMED_SIZE ) !BOP ! !ROUTINE: mp_sendtrirr --- Initiate communication of contiguous tracer parcels ! ! !INTERFACE: subroutine mp_sendtrirr ( comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq, & 2,4 ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts, & ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr, & modc ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks integer, intent(in) :: mbeg ! initial tracer index integer, intent(in) :: mend ! final tracer index integer, intent(in) :: mq ! total tracer indices integer, intent(in) :: ifirsts ! first I index of source integer, intent(in) :: ilasts ! last I index of source integer, intent(in) :: jfirsts ! first j index of source integer, intent(in) :: jlasts ! last j index of source integer, intent(in) :: kfirsts ! first k index of source integer, intent(in) :: klasts ! last k index of source integer, intent(in) :: ifirstr ! first I index of target integer, intent(in) :: ilastr ! last I index of target integer, intent(in) :: jfirstr ! first j index of target integer, intent(in) :: jlastr ! last j index of target integer, intent(in) :: kfirstr ! first k index of target integer, intent(in) :: klastr ! last k index of target integer, optional, intent(in) :: modc(4) ! 1: classical, swap p2p, swap a2a ! 2: handshake ! 3: send vs isend ! 4: max number of outstanding requests real(r8), intent(in) :: qin(*) ! input tracer array ! !OUTPUT PARAMETERS: real(r8), intent(out) :: qout(*) ! output tracer array ! ! !DESCRIPTION: ! Communicate a number of contiguous parcels to/from arbitrary set of PEs. ! Modc(1): if 0, use original approach of posting all communications here and placing ! wait points in mp_recvtrirr; if 1, call swap routine with p2p messages; if 2, call swap ! routine with a2a messages. ! Modc(2): if 1, then apply handshaking (don't send until corresponding receive is posted) ! Modc(3): if 1, then use blocking send; otherwise use nonblocking send ! Modc(4): maximum number of outstanding requests (applies to swap routines only) ! ! !REVISION HISTORY: ! 02.08.13 Sawyer Creation ! 02.11.06 Mirin Optimizations ! 03.03.03 Sawyer Use partneroffset ! 03.06.24 Sawyer Integrated Use_Mpi_Types; added qout ! 04.02.24 Mirin Various mpi2 options ! 08.09.18 Mirin Major overhaul, to include approaches from Mirin and Worley ! 09.10.07 Worley eliminated mpi_recv from handshake logic ! ! !BUGS: ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer ipe, qsize, offset, blocksize, nparcels, offset_s, offset_r, ierr, mod_method integer p, mysize, nthpc, minsize, nthrd, pn, pt, tmpsize, unitsize, offset_0 integer i, j, send_tag, recv_tag, num_s, num_r, m integer :: offset_v (Max_Nparcels) integer :: hs_rcvids(numpro) integer signal integer ipe2, ceil2num integer numtr, numtrm integer sw_local, maxreq_local logical hs_local, send_local logical sw_alltoall integer comm_pid integer ijks, ijkr, ij #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif if (present(modc)) then sw_local = modc(1) hs_local = (modc(2) .eq. 1) send_local = (modc(3) .eq. 1) maxreq_local = modc(4) else sw_local = 0 hs_local = .true. send_local = .true. maxreq_local = -1 endif ! Do not call mp_swaptrirr unless mod_method equals 0 mod_method = recv_bl(1)%method if (mod_method .gt. 0) sw_local = 0 if (sw_local .gt. 0) then sw_alltoall = (sw_local .eq. 2) call mp_swaptrirr(comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq, & ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts, & ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr, & sw_handshake=hs_local, sw_maxreq=maxreq_local, & sw_alltoall=sw_alltoall, sw_send=send_local) else call MPI_COMM_RANK (comm, comm_pid, ierr) signal = 1 ceil2num = ceil2(numpro) numtrm = mend - mbeg numtr = numtrm + 1 ! num_s = 0 if this processes is not part of the sending decomposition num_s = size(send_bl) if (send_bl(1)%Nparcels == -1) then num_s = 0 endif ! num_r = 0 if this processes is not part of the receiving decomposition num_r = size(recv_bl) if (recv_bl(1)%Nparcels == -1) then num_r = 0 endif r8_win%ncall_s = r8_win%ncall_s + 1 ijks =(klasts-kfirsts+1)*(jlasts-jfirsts+1)*(ilasts-ifirsts+1) ijkr =(klastr-kfirstr+1)*(jlastr-jfirstr+1)*(ilastr-ifirstr+1) if (mod_method .gt. 0) then ! ! mpi derived types if (r8_win%ncall_s .gt. MaxTrf-numtrm) then write(iulog,*) "mp_sendtrirr: derived type handle count exceeded - exiting" write(iulog,*) "r8_win%ncall_s MaxTrf = ", r8_win%ncall_s, MaxTrf call exit(1) endif ! ! MPI: Irecv over all processes ! if (hs_local) then hs_rcvids(:) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then if (ipe-1 /= comm_pid) & call MPI_IRECV ( signal, 1, mp_i4, ipe-1, comm_pid, comm, hs_rcvids(ipe), ierr ) endif enddo endif OutHandle(:,r8_win%ncall_s:r8_win%ncall_s+numtrm) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_r) cycle ! ! Receive the buffers with MPI_Irecv. Non-blocking ! if ( recv_bl(ipe)%type /= MPI_DATATYPE_NULL ) then recv_tag = ipe-1 + modcam_tagoffset do m = mbeg, mend call mpi_irecv( qout((m-1)*ijkr+1), 1, recv_bl(ipe)%type, ipe-1, recv_tag, & comm, OutHandle(ipe,r8_win%ncall_s+m-mbeg), ierr ) enddo if (hs_local) then if (ipe-1 /= comm_pid) & call MPI_SEND ( signal, 1, mp_i4, ipe-1, ipe-1, comm, ierr ) endif endif enddo ! ! MPI: Isend/Send over all processes; use risend/rsend with hs ! InHandle(:,r8_win%ncall_s:r8_win%ncall_s+numtrm) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle ! ! Send the individual buffers with non-blocking sends ! if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then send_tag = comm_pid + modcam_tagoffset if (hs_local) then if (ipe-1 /= comm_pid) & call MPI_WAIT ( hs_rcvids(ipe), Status, ierr ) if (send_local) then do m = mbeg, mend call mpi_rsend( qin((m-1)*ijks+1), 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, ierr ) enddo else do m = mbeg, mend call mpi_irsend( qin((m-1)*ijks+1), 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, InHandle(ipe,r8_win%ncall_s), ierr ) enddo endif else if (send_local) then do m = mbeg, mend call mpi_send( qin((m-1)*ijks+1), 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, ierr ) enddo else do m = mbeg, mend call mpi_isend( qin((m-1)*ijks+1), 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, InHandle(ipe,r8_win%ncall_s), ierr ) enddo endif endif endif enddo else ! temporary contiguous buffers if (r8_win%ncall_s .gt. max_irr-numtrm) then write(iulog,*) "mp_sendtrirr: insufficient window storage - exiting" write(iulog,*) "r8_win%ncall_s max_irr = ", r8_win%ncall_s, max_irr call exit(1) endif unitsize = r8_win%size/max_irr ! issue call to receive data in global receive buffer offset_0 = (r8_win%ncall_s-1)*unitsize offset_s = offset_0 offset_r = offset_0 if (hs_local) then hs_rcvids(:) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle qsize = numtr*send_bl(ipe)%Tot_Size if (qsize .ne. 0) then r8_win%dest = ipe-1 send_tag = comm_pid + modcam_tagoffset if (r8_win%dest /= comm_pid) & call MPI_IRECV ( signal, 1, mp_i4, r8_win%dest, send_tag, comm, hs_rcvids(ipe), ierr ) endif enddo endif do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_r) cycle r8_win%size_r = numtr*recv_bl(ipe)%Tot_Size if (r8_win%size_r .ne. 0) then r8_win%offset_r = offset_r offset_r = offset_r + r8_win%size_r r8_win%src = ipe-1 if (numtr*unitsize >= offset_r-offset_0) then recv_tag = r8_win%src + modcam_tagoffset qsize = r8_win%size_r r8_win%nrecv = r8_win%nrecv + 1 call MPI_IRECV(ga_r8_r(r8_win%offset_r+1), qsize, mp_r8, r8_win%src, & recv_tag, comm, r8_win%rqest(r8_win%nrecv), ierror) if (hs_local) then if (r8_win%src /= comm_pid) & call MPI_SEND ( signal, 1, mp_i4, r8_win%src, recv_tag, comm, ierror) endif else write(iulog,*) "Fatal mp_sendtrirr: receive window out of space - exiting" write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif endif enddo ! gather data into global send buffer do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle qsize = numtr*send_bl(ipe)%Tot_Size if (qsize .ne. 0) then r8_win%dest = ipe-1 r8_win%offset_s = offset_s offset_s = offset_s + qsize if (offset_s-offset_0 .gt. numtr*unitsize) then write(iulog,*) "Fatal mp_sendtrirr: send window out of space - exiting" write(iulog,*) 'comm_pid ipe unitsize offset_s offset_0 = ', comm_pid, & ipe, unitsize, offset_s, offset_0 call exit(1) endif offset_v(1) = r8_win%offset_s do j = 2, send_bl(ipe)%nparcels offset_v(j) = offset_v(j-1) + send_bl(ipe)%blocksizes(j-1) enddo do j = 1, send_bl(ipe)%nparcels do m = mbeg, mend do i = 1, send_bl(ipe)%blocksizes(j) ij = send_bl(ipe)%displacements(j)+i ga_r8_s(send_bl(ipe)%Tot_Size*(m-mbeg)+offset_v(j)+i) = qin((m-1)*ijks+ij) enddo enddo enddo ! nonblocking send send_tag = comm_pid + modcam_tagoffset r8_win%nsend = r8_win%nsend + 1 if (hs_local) then if (r8_win%dest /= comm_pid) & call MPI_WAIT ( hs_rcvids(ipe), Status, ierr ) if (send_local) then call MPI_RSEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, & send_tag, comm, ierr) else call MPI_IRSEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, & send_tag, comm, r8_win%sqest(r8_win%nsend), ierr) endif else if (send_local) then call MPI_SEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, & send_tag, comm, ierr) else call MPI_ISEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, & send_tag, comm, r8_win%sqest(r8_win%nsend), ierr) endif endif endif enddo endif ! mod_method r8_win%ncall_s = r8_win%ncall_s + numtrm endif ! sw_local #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif end subroutine mp_sendtrirr !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_recvtrirr --- Finalize communication of contiguous tracer parcels ! ! !INTERFACE: subroutine mp_recvtrirr ( comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq, & 2,2 ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts, & ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr, & modc ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks integer, intent(in) :: mbeg ! initial tracer index integer, intent(in) :: mend ! final tracer index integer, intent(in) :: mq ! total tracer indices integer, intent(in) :: ifirsts ! first I index of source integer, intent(in) :: ilasts ! last I index of source integer, intent(in) :: jfirsts ! first j index of source integer, intent(in) :: jlasts ! last j index of source integer, intent(in) :: kfirsts ! first k index of source integer, intent(in) :: klasts ! last k index of source integer, intent(in) :: ifirstr ! first I index of target integer, intent(in) :: ilastr ! last I index of target integer, intent(in) :: jfirstr ! first j index of target integer, intent(in) :: jlastr ! last j index of target integer, intent(in) :: kfirstr ! first k index of target integer, intent(in) :: klastr ! last k index of target integer, optional, intent(in) :: modc(4) ! 1: classical, swap p2p, swap a2a ! 2: handshake ! 3: send vs isend ! 4: max number of outstanding requests real(r8), intent(in) :: qin(*) ! input tracer array ! !OUTPUT PARAMETERS: real(r8), intent(out) :: qout(*) ! output tracer array ! ! !DESCRIPTION: ! Complete transfer of a generalized region initiated by {\tt mp\_sendtrirr}. ! Communicate a number of contiguous parcels to/from arbitrary set of PEs. ! Modc(1): if 0, use original approach of posting all communications in mp_sendtrirr and ! placing wait points here; otherwise don't do anything - mp_swaptrirr is called from mp_sendirr. ! Modc(3): if 1, then use blocking send; otherwise use nonblocking send ! ! !REVISION HISTORY: ! 02.08.15 Sawyer Creation ! 02.11.06 Mirin Optimizations ! 03.03.03 Sawyer Now using packed arrays for MPI2 ! 04.02.24 Mirin Various mpi2 options ! 08.09.18 Mirin Major overhaul, to include approaches from Mirin and Worley ! !EOP !------------------------------------------------------------------------------ !BOC integer :: ipe, blocksize, offset_r, mod_method integer unitsize, offset_0 integer Ierr integer InStats(numpro*MPI_STATUS_SIZE) integer OutStats(numpro*MPI_STATUS_SIZE) integer i, j, num_r, num_s, m integer :: offset_v (Max_Nparcels) integer ipe2, ceil2num integer numtr, numtrm integer sw_local, maxreq_local logical hs_local, send_local logical sw_alltoall integer comm_size, comm_pid integer ijks, ijkr, ij if (present(modc)) then sw_local = modc(1) hs_local = (modc(2) .eq. 1) send_local = (modc(3) .eq. 1) maxreq_local = modc(4) else sw_local = 0 hs_local = .true. send_local = .true. maxreq_local = -1 endif ! Do not call mp_swaptrirr (hence return) unless mod_method equals 0 mod_method = recv_bl(1)%method if (mod_method .gt. 0) sw_local = 0 ! Return if swap_irr if (sw_local .gt. 0) return #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_SIZE (comm, comm_size, ierr) call MPI_COMM_RANK (comm, comm_pid, ierr) ceil2num = ceil2(numpro) numtrm = mend - mbeg numtr = numtrm + 1 ! num_s = 0 if this processes is not part of the sending decomposition num_s = size(send_bl) if (send_bl(1)%Nparcels == -1) then num_s = 0 endif ! num_r = 0 if this processes is not part of the receiving decomposition num_r = size(recv_bl) if (recv_bl(1)%Nparcels == -1) then num_r = 0 endif r8_win%ncall_r = r8_win%ncall_r + 1 ijks =(klasts-kfirsts+1)*(jlasts-jfirsts+1)*(ilasts-ifirsts+1) ijkr =(klastr-kfirstr+1)*(jlastr-jfirstr+1)*(ilastr-ifirstr+1) if (mod_method .gt. 0) then ! mpi derived types if (r8_win%ncall_r .gt. MaxTrf-numtrm) then write(iulog,*) "mp_recvtrirr: derived type handle count exceeded - exiting" write(iulog,*) "r8_win%ncall_r MaxTrf = ", r8_win%ncall_r, MaxTrf call exit(1) endif if (num_s .gt. 0 .and. (.not. send_local)) then do m = mbeg, mend CALL MPI_WAITALL( comm_size, InHandle(:,r8_win%ncall_r+m-mbeg), InStats, Ierr ) enddo endif if (num_r .gt. 0) then do m = mbeg, mend CALL MPI_WAITALL( comm_size, OutHandle(:,r8_win%ncall_r+m-mbeg), OutStats, Ierr ) enddo endif else ! temporary contiguous buffer / global window if (r8_win%ncall_r .gt. max_irr-numtrm) then write(iulog,*) "mp_recvtrirr: insufficient window storage - exiting" write(iulog,*) "r8_win%ncall_r max_irr = ", r8_win%ncall_r, max_irr call exit(1) endif unitsize = r8_win%size/max_irr ! scatter data from global receive buffer to final destination offset_0 = (r8_win%ncall_r-1)*unitsize offset_r = offset_0 do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_r) cycle r8_win%size_r = numtr*recv_bl(ipe)%Tot_Size if (r8_win%size_r .ne. 0) then r8_win%offset_r = offset_r offset_r = offset_r + r8_win%size_r if (offset_r-offset_0 .gt. numtr*unitsize) then write(iulog,*) "Fatal mp_recvtrirr: receive window out of space - exiting" write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif r8_win%nread = r8_win%nread + 1 call MPI_WAIT(r8_win%rqest(r8_win%nread), Status, ierr) offset_v(1) = r8_win%offset_r do j = 2, recv_bl(ipe)%Nparcels offset_v(j) = offset_v(j-1) + recv_bl(ipe)%blocksizes(j-1) enddo do j = 1, recv_bl(ipe)%Nparcels do m = mbeg, mend do i = 1, recv_bl(ipe)%blocksizes(j) ij = recv_bl(ipe)%displacements(j)+i qout((m-1)*ijkr+ij) = ga_r8_r(recv_bl(ipe)%Tot_Size*(m-mbeg)+offset_v(j)+i) enddo enddo enddo endif enddo if ((r8_win%ncall_s == r8_win%ncall_r + numtrm) .and. (.not. send_local)) then call MPI_WAITALL(r8_win%nsend, r8_win%sqest, Stats, ierror) endif endif ! mod_method .gt. 0 r8_win%ncall_r = r8_win%ncall_r + numtrm if (r8_win%ncall_s == r8_win%ncall_r) then r8_win%nsend = 0 r8_win%nrecv = 0 r8_win%nread = 0 r8_win%ncall_s = 0 r8_win%ncall_r = 0 endif #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_recvtrirr !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_swaptrirr --- Write r8 contiguous parcels to global array ! using XOR swap ordering - for multiple tracers ! ! !INTERFACE: subroutine mp_swaptrirr ( comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq, & 2,4 ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts, & ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr, & sw_handshake, sw_maxreq, sw_alltoall, sw_send ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks integer, intent(in) :: mbeg ! initial tracer index integer, intent(in) :: mend ! final tracer index integer, intent(in) :: mq ! total tracer indices integer, intent(in) :: ifirsts ! first I index of source integer, intent(in) :: ilasts ! last I index of source integer, intent(in) :: jfirsts ! first j index of source integer, intent(in) :: jlasts ! last j index of source integer, intent(in) :: kfirsts ! first k index of source integer, intent(in) :: klasts ! last k index of source integer, intent(in) :: ifirstr ! first I index of target integer, intent(in) :: ilastr ! last I index of target integer, intent(in) :: jfirstr ! first j index of target integer, intent(in) :: jlastr ! last j index of target integer, intent(in) :: kfirstr ! first k index of target integer, intent(in) :: klastr ! last k index of target logical, optional, intent(in) :: sw_handshake ! use flow control and ! ready send integer, optional, intent(in) :: sw_maxreq ! maximum number of outstanding ! MPI requests logical, optional, intent(in) :: sw_alltoall ! use mpi_alltoall logical, optional, intent(in) :: sw_send ! use mpi_send instead of isend real(r8), intent(in) :: qin(*) ! input tracer array ! !OUTPUT PARAMETERS: real(r8), intent(out) :: qout(*) ! output tracer array ! ! !DESCRIPTION: ! ! XOR-ordered version of all-to-all communication ! ! WARNING: mod_comm parameter max_irr might need to be set larger than expected ! when swapping multiple variables; specifically, max_irr must be at least ! as large as the incoming r8_win%ncall_s + the number of variables to ! be swapped ! ! !REVISION HISTORY: ! 08.06.30 Worley original: derived from mp_sendirr, but using ! swapm logic and XOR swap order ! 08.08.22 Worley removed swapm; reimplemented with native MPI, ! added flow control/ready send option and maxreq ! throttling, added alltoall option ! 09.10.07 Worley eliminated mpi_recv from handshake logic ! ! !BUGS: ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer :: i, j, p, istep, num_s, num_r integer :: comm_pid, comm_size, steps, ierr integer :: ipe, offset_s, offset_r, offset_0, unitsize integer :: sndlths(0:numpro-1), sdispls(0:numpro-1) integer :: rcvlths(0:numpro-1), rdispls(0:numpro-1) integer :: swapids(numpro) integer :: sndids(numpro) ! nonblocking MPI send request ids integer :: rcvids(numpro) ! nonblocking MPI recv request ids integer :: hs_rcvids(numpro) ! nonblocking MPI handshake recv request ids integer :: InStats(numpro*MPI_STATUS_SIZE) integer :: OutStats(numpro*MPI_STATUS_SIZE) integer :: offset_v integer :: rstep integer :: maxreq, maxreqh, hs logical :: handshake, alltoall, sendd integer :: numtr, numtrm, m integer ijks, ijkr, ij #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_SIZE (comm, comm_size, ierr) call MPI_COMM_RANK (comm, comm_pid, ierr) ! num_s = 0 if this process is not part of the sending decomposition num_s = size(send_bl) if (send_bl(1)%Nparcels == -1) then num_s = 0 endif ! num_r = 0 if this process is not part of the receiving decomposition num_r = size(recv_bl) if (recv_bl(1)%Nparcels == -1) then num_r = 0 endif if ( present(sw_handshake) ) then handshake = sw_handshake else handshake = .false. endif if ( present(sw_alltoall) ) then alltoall = sw_alltoall else alltoall = .false. endif if ( present(sw_send) ) then sendd = sw_send else sendd = .false. endif numtrm = mend - mbeg numtr = numtrm + 1 ijks =(klasts-kfirsts+1)*(jlasts-jfirsts+1)*(ilasts-ifirsts+1) ijkr =(klastr-kfirstr+1)*(jlastr-jfirstr+1)*(ilastr-ifirstr+1) unitsize = r8_win%size/max_irr ! advance to unused portion of storage window r8_win%ncall_s = r8_win%ncall_s + 1 if (r8_win%ncall_s .gt. max_irr-numtrm) then write(iulog,*) "mp_swaptrirr: insufficient window storage - exiting" write(iulog,*) "r8_win%ncall_s max_irr = ", r8_win%ncall_s, max_irr call exit(1) endif ! calculate send lengths and displacements offset_0 = (r8_win%ncall_s-1)*unitsize offset_s = offset_0 sndlths(:) = 0 sdispls(:) = 0 do ipe=1, num_s sndlths(ipe-1) = numtr*send_bl(ipe)%Tot_Size sdispls(ipe-1) = offset_s if (sndlths(ipe-1) .ne. 0) then offset_s = offset_s + sndlths(ipe-1) if (offset_s-offset_0 .gt. numtr*unitsize) then write(iulog,*) "Fatal mp_swaptrirr: send window out of space - exiting" write(iulog,*) '1 comm_pid ipe unitsize offset_s offset_0 = ', comm_pid, & ipe, unitsize, offset_s, offset_0 call exit(1) endif endif enddo ! calculate receive lengths and displacements offset_r = offset_0 rcvlths(:) = 0 rdispls(:) = 0 do ipe=1, num_r rcvlths(ipe-1) = numtr*recv_bl(ipe)%Tot_Size rdispls(ipe-1) = offset_r if (rcvlths(ipe-1) .ne. 0) then offset_r = offset_r + rcvlths(ipe-1) if (numtr*unitsize < offset_r-offset_0) then write(iulog,*) "Fatal mp_swaptrirr: receive window out of space - exiting" write(iulog,*) '1 comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif endif enddo ! Calculate swap partners and number of steps in point-to-point ! implementations of alltoall algorithm. steps = 0 do ipe=1,ceil2(comm_size)-1 p = pair(comm_size,ipe,comm_pid) if (p >= 0) then if (sndlths(p) > 0 .or. rcvlths(p) > 0) then steps = steps + 1 swapids(steps) = p end if end if end do if (.not. alltoall) then sndids(1:steps) = MPI_REQUEST_NULL rcvids(1:steps) = MPI_REQUEST_NULL if (steps .eq. 0) then maxreq = 0 maxreqh = 0 elseif (steps .eq. 1) then maxreq = 1 maxreqh = 1 else if ( present(sw_maxreq) ) then if ((sw_maxreq .le. steps) .and. (sw_maxreq .ge. 0)) then maxreq = sw_maxreq if (maxreq > 1) then maxreqh = maxreq/2 else maxreq = 2 maxreqh = 1 endif else maxreq = steps maxreqh = steps endif else maxreq = steps maxreqh = steps endif endif ! Post initial handshake receive requests if (handshake) then do istep=1,maxreq p = swapids(istep) if (sndlths(p) > 0) then call mpi_irecv ( hs, 1, mp_i4, p, comm_pid, comm, & hs_rcvids(istep), ierr ) endif enddo endif ! Post initial receive requests do istep=1,maxreq p = swapids(istep) if (rcvlths(p) > 0) then offset_r = rdispls(p)+1 call mpi_irecv ( ga_r8_r(offset_r), rcvlths(p), mp_r8, & p, p, comm, rcvids(istep), ierr ) if (handshake) then call mpi_send( hs, 1, mp_i4, p, p, comm, & ierr ) endif endif enddo rstep = maxreq ! endif ! gather data into global send buffer do istep=1,steps p = swapids(istep) if (sndlths(p) .ne. 0) then offset_v = sdispls(p) do j = 1, send_bl(p+1)%nparcels do m = mbeg, mend do i = 1, send_bl(p+1)%blocksizes(j) ij = send_bl(p+1)%displacements(j)+i ga_r8_s(send_bl(p+1)%Tot_Size*(m-mbeg)+offset_v+i) = qin((m-1)*ijks+ij) enddo enddo offset_v = offset_v + send_bl(p+1)%blocksizes(j) enddo endif if (.not. alltoall) then ! Submit new i(r)send request offset_s = sdispls(p)+1 if (sndlths(p) > 0) then if (handshake) then call mpi_wait( hs_rcvids(istep), Status, ierr ) if (sendd) then call mpi_rsend( ga_r8_s(offset_s), sndlths(p), mp_r8, & p, comm_pid, comm, ierr ) else call mpi_irsend( ga_r8_s(offset_s), sndlths(p), mp_r8, & p, comm_pid, comm, sndids(istep), ierr ) endif else if (sendd) then call mpi_send ( ga_r8_s(offset_s), sndlths(p), mp_r8, & p, comm_pid, comm, ierr ) else call mpi_isend ( ga_r8_s(offset_s), sndlths(p), mp_r8, & p, comm_pid, comm, sndids(istep), ierr ) endif endif endif if (istep > maxreqh) then ! Wait for oldest irecv request to complete call mpi_wait( rcvids(istep-maxreqh), OutStats, ierr ) if (rstep < steps) then rstep = rstep + 1 p = swapids(rstep) ! Submit a new handshake irecv request if (handshake) then if (sndlths(p) > 0) then call mpi_irecv( hs, 1, mp_i4, p, comm_pid, comm, & hs_rcvids(rstep), ierr ) endif endif ! Submit a new irecv request if (rcvlths(p) > 0) then offset_r = rdispls(p)+1 call mpi_irecv( ga_r8_r(offset_r), rcvlths(p), mp_r8, & p, p, comm, rcvids(rstep), ierr ) if (handshake) then call mpi_send ( hs, 1, mp_i4, p, p, comm, & ierr ) endif endif endif ! Wait for outstanding i(r)send request to complete if (.not. sendd) then call mpi_wait( sndids(istep-maxreqh), InStats, ierr ) endif endif ! endif ! enddo ! local copy to send buffer if (sndlths(comm_pid) .ne. 0) then offset_v = sdispls(comm_pid) do j = 1, send_bl(comm_pid+1)%nparcels do m = mbeg, mend do i = 1, send_bl(comm_pid+1)%blocksizes(j) ij = send_bl(comm_pid+1)%displacements(j)+i ga_r8_s(send_bl(comm_pid+1)%Tot_Size*(m-mbeg)+offset_v+i) = qin((m-1)*ijks+ij) enddo enddo offset_v = offset_v + send_bl(comm_pid+1)%blocksizes(j) enddo if (.not. alltoall) then ga_r8_r(rdispls(comm_pid)+1:rdispls(comm_pid)+rcvlths(comm_pid)) = & ga_r8_s(sdispls(comm_pid)+1:sdispls(comm_pid)+sndlths(comm_pid)) endif endif if (alltoall) then call mpi_alltoallv (ga_r8_s, sndlths, sdispls, mp_r8, & ga_r8_r, rcvlths, rdispls, mp_r8, & comm, ierror) endif ! local copy from receive buffer if (rcvlths(comm_pid) .ne. 0) then offset_v = rdispls(comm_pid) do j = 1, recv_bl(comm_pid+1)%Nparcels do m = mbeg, mend do i = 1, recv_bl(comm_pid+1)%blocksizes(j) ij = recv_bl(comm_pid+1)%displacements(j)+i qout((m-1)*ijkr+ij) = ga_r8_r(recv_bl(comm_pid+1)%Tot_Size*(m-mbeg)+offset_v+i) enddo enddo offset_v = offset_v + recv_bl(comm_pid+1)%blocksizes(j) enddo endif ! scatter data from global receive buffer to final destination do istep=1,steps p = swapids(istep) if (.not. alltoall) then if (istep > steps-maxreqh) then call mpi_wait( rcvids(istep), OutStats, ierr ) endif endif if (rcvlths(p) .ne. 0) then offset_v = rdispls(p) do j = 1, recv_bl(p+1)%Nparcels do m = mbeg, mend do i = 1, recv_bl(p+1)%blocksizes(j) ij = recv_bl(p+1)%displacements(j)+i qout((m-1)*ijkr+ij) = ga_r8_r(recv_bl(p+1)%Tot_Size*(m-mbeg)+offset_v+i) enddo enddo offset_v = offset_v + recv_bl(p+1)%blocksizes(j) enddo endif enddo ! Wait for any outstanding send requests to complete. if (.not. alltoall .and. .not. sendd) then call mpi_waitall( maxreqh, sndids(steps-maxreqh+1), InStats, ierr ) endif ! clean-up ! make used portion of storage window available for reuse r8_win%ncall_s = r8_win%ncall_s - 1 #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_swaptrirr # endif !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ # if defined( MOD_SPECIFIED_SHAPE ) !BOP ! !ROUTINE: mp_sendtrirr --- Initiate communication of contiguous tracer parcels ! ! !INTERFACE: subroutine mp_sendtrirr ( comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq, & 2,4 ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts, & ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr, & modc ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks integer, intent(in) :: mbeg ! initial tracer index integer, intent(in) :: mend ! final tracer index integer, intent(in) :: mq ! total tracer indices integer, intent(in) :: ifirsts ! first I index of source integer, intent(in) :: ilasts ! last I index of source integer, intent(in) :: jfirsts ! first j index of source integer, intent(in) :: jlasts ! last j index of source integer, intent(in) :: kfirsts ! first k index of source integer, intent(in) :: klasts ! last k index of source integer, intent(in) :: ifirstr ! first I index of target integer, intent(in) :: ilastr ! last I index of target integer, intent(in) :: jfirstr ! first j index of target integer, intent(in) :: jlastr ! last j index of target integer, intent(in) :: kfirstr ! first k index of target integer, intent(in) :: klastr ! last k index of target integer, optional, intent(in) :: modc(4) ! 1: classical, swap p2p, swap a2a ! 2: handshake ! 3: send vs isend ! 4: max number of outstanding requests real(r8), intent(in) :: qin(ifirsts:ilasts,jfirsts:jlasts,kfirsts:klasts,1:mq) ! input tracer array ! !OUTPUT PARAMETERS: real(r8), intent(out) :: qout(ifirstr:ilastr,jfirstr:jlastr,kfirstr:klastr,1:mq) ! output tracer array ! ! !DESCRIPTION: ! Communicate a number of contiguous parcels to/from arbitrary set of PEs. ! Modc(1): if 0, use original approach of posting all communications here and placing ! wait points in mp_recvtrirr; if 1, call swap routine with p2p messages; if 2, call swap ! routine with a2a messages. ! Modc(2): if 1, then apply handshaking (don't send until corresponding receive is posted) ! Modc(3): if 1, then use blocking send; otherwise use nonblocking send ! Modc(4): maximum number of outstanding requests (applies to swap routines only) ! ! !REVISION HISTORY: ! 02.08.13 Sawyer Creation ! 02.11.06 Mirin Optimizations ! 03.03.03 Sawyer Use partneroffset ! 03.06.24 Sawyer Integrated Use_Mpi_Types; added qout ! 04.02.24 Mirin Various mpi2 options ! 08.09.18 Mirin Major overhaul, to include approaches from Mirin and Worley ! 09.10.07 Worley eliminated mpi_recv from handshake logic ! ! !BUGS: ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer ipe, qsize, offset, blocksize, nparcels, offset_s, offset_r, ierr, mod_method integer p, mysize, nthpc, minsize, nthrd, pn, pt, tmpsize, unitsize, offset_0 integer i, j, send_tag, recv_tag, num_s, num_r, m integer :: offset_v (Max_Nparcels) integer :: hs_rcvids(numpro) integer signal integer ipe2, ceil2num integer numtr, numtrm integer sw_local, maxreq_local logical hs_local, send_local logical sw_alltoall integer comm_pid integer ip, jp, kp, mp, ir, jr, jir, mt #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif if (present(modc)) then sw_local = modc(1) hs_local = (modc(2) .eq. 1) send_local = (modc(3) .eq. 1) maxreq_local = modc(4) else sw_local = 0 hs_local = .true. send_local = .true. maxreq_local = -1 endif ! Do not call mp_swaptrirr unless mod_method equals 0 mod_method = recv_bl(1)%method if (mod_method .gt. 0) sw_local = 0 if (sw_local .gt. 0) then sw_alltoall = (sw_local .eq. 2) call mp_swaptrirr(comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq, & ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts, & ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr, & sw_handshake=hs_local, sw_maxreq=maxreq_local, & sw_alltoall=sw_alltoall, sw_send=send_local) else call MPI_COMM_RANK (comm, comm_pid, ierr) signal = 1 ceil2num = ceil2(numpro) numtrm = mend - mbeg numtr = numtrm + 1 ! num_s = 0 if this processes is not part of the sending decomposition num_s = size(send_bl) if (send_bl(1)%Nparcels == -1) then num_s = 0 endif ! num_r = 0 if this processes is not part of the receiving decomposition num_r = size(recv_bl) if (recv_bl(1)%Nparcels == -1) then num_r = 0 endif r8_win%ncall_s = r8_win%ncall_s + 1 if (mod_method .gt. 0) then ! ! mpi derived types if (r8_win%ncall_s .gt. MaxTrf-numtrm) then write(iulog,*) "mp_sendtrirr: derived type handle count exceeded - exiting" write(iulog,*) "r8_win%ncall_s MaxTrf = ", r8_win%ncall_s, MaxTrf call exit(1) endif ! ! MPI: Irecv over all processes ! if (hs_local) then hs_rcvids(:) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then if (ipe-1 /= comm_pid) & call MPI_IRECV ( signal, 1, mp_i4, ipe-1, comm_pid, comm, hs_rcvids(ipe), ierr ) endif enddo endif OutHandle(:,r8_win%ncall_s:r8_win%ncall_s+numtrm) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_r) cycle ! ! Receive the buffers with MPI_Irecv. Non-blocking ! if ( recv_bl(ipe)%type /= MPI_DATATYPE_NULL ) then recv_tag = ipe-1 + modcam_tagoffset do m = mbeg, mend call mpi_irecv( qout(:,:,:,m), 1, recv_bl(ipe)%type, ipe-1, recv_tag, & comm, OutHandle(ipe,r8_win%ncall_s+m-mbeg), ierr ) enddo if (hs_local) then if (ipe-1 /= comm_pid) & call MPI_SEND ( signal, 1, mp_i4, ipe-1, ipe-1, comm, ierr ) endif endif enddo ! ! MPI: Isend/Send over all processes; use risend/rsend with hs ! InHandle(:,r8_win%ncall_s:r8_win%ncall_s+numtrm) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle ! ! Send the individual buffers with non-blocking sends ! if ( send_bl(ipe)%type /= MPI_DATATYPE_NULL ) then send_tag = comm_pid + modcam_tagoffset if (hs_local) then if (ipe-1 /= comm_pid) & call MPI_WAIT ( hs_rcvids(ipe), Status, ierr ) if (send_local) then do m = mbeg, mend call mpi_rsend( qin(:,:,:,m), 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, ierr ) enddo else do m = mbeg, mend call mpi_irsend( qin(:,:,:,m), 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, InHandle(ipe,r8_win%ncall_s), ierr ) enddo endif else if (send_local) then do m = mbeg, mend call mpi_send( qin(:,:,:,m), 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, ierr ) enddo else do m = mbeg, mend call mpi_isend( qin(:,:,:,m), 1, send_bl(ipe)%type, ipe-1, send_tag, & comm, InHandle(ipe,r8_win%ncall_s), ierr ) enddo endif endif endif enddo else ! temporary contiguous buffers jr = jlasts - jfirsts + 1 ir = ilasts - ifirsts + 1 jir = jr * ir if (r8_win%ncall_s .gt. max_irr-numtrm) then write(iulog,*) "mp_sendtrirr: insufficient window storage - exiting" write(iulog,*) "r8_win%ncall_s max_irr = ", r8_win%ncall_s, max_irr call exit(1) endif unitsize = r8_win%size/max_irr ! issue call to receive data in global receive buffer offset_0 = (r8_win%ncall_s-1)*unitsize offset_s = offset_0 offset_r = offset_0 if (hs_local) then hs_rcvids(:) = MPI_REQUEST_NULL do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle qsize = numtr*send_bl(ipe)%Tot_Size if (qsize .ne. 0) then r8_win%dest = ipe-1 send_tag = comm_pid + modcam_tagoffset if (r8_win%dest /= comm_pid) & call MPI_IRECV ( signal, 1, mp_i4, r8_win%dest, send_tag, comm, hs_rcvids(ipe), ierr ) endif enddo endif do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_r) cycle r8_win%size_r = numtr*recv_bl(ipe)%Tot_Size if (r8_win%size_r .ne. 0) then r8_win%offset_r = offset_r offset_r = offset_r + r8_win%size_r r8_win%src = ipe-1 if (numtr*unitsize >= offset_r-offset_0) then recv_tag = r8_win%src + modcam_tagoffset qsize = r8_win%size_r r8_win%nrecv = r8_win%nrecv + 1 call MPI_IRECV(ga_r8_r(r8_win%offset_r+1), qsize, mp_r8, r8_win%src, & recv_tag, comm, r8_win%rqest(r8_win%nrecv), ierror) if (hs_local) then if (r8_win%src /= comm_pid) & call MPI_SEND ( signal, 1, mp_i4, r8_win%src, recv_tag, comm, ierror) endif else write(iulog,*) "Fatal mp_sendtrirr: receive window out of space - exiting" write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif endif enddo ! gather data into global send buffer do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_s) cycle qsize = numtr*send_bl(ipe)%Tot_Size if (qsize .ne. 0) then r8_win%dest = ipe-1 r8_win%offset_s = offset_s offset_s = offset_s + qsize if (offset_s-offset_0 .gt. numtr*unitsize) then write(iulog,*) "Fatal mp_sendtrirr: send window out of space - exiting" write(iulog,*) 'comm_pid ipe unitsize offset_s offset_0 = ', comm_pid, & ipe, unitsize, offset_s, offset_0 call exit(1) endif offset_v(1) = r8_win%offset_s do j = 2, send_bl(ipe)%nparcels offset_v(j) = offset_v(j-1) + send_bl(ipe)%blocksizes(j-1) enddo do j = 1, send_bl(ipe)%nparcels do m = mbeg, mend do i = 1, send_bl(ipe)%blocksizes(j) mp = send_bl(ipe)%displacements(j)+i kp = kfirsts + (mp-1)/jir mt = (kp-kfirsts)*jir jp = jfirsts + (mp-mt-1)/ir ip = mp-mt - (jp-jfirsts)*ir + ifirsts-1 ga_r8_s(send_bl(ipe)%Tot_Size*(m-mbeg)+offset_v(j)+i) = qin(ip,jp,kp,m) enddo enddo enddo ! nonblocking send send_tag = comm_pid + modcam_tagoffset r8_win%nsend = r8_win%nsend + 1 if (hs_local) then if (r8_win%dest /= comm_pid) & call MPI_WAIT ( hs_rcvids(ipe), Status, ierr ) if (send_local) then call MPI_RSEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, & send_tag, comm, ierr) else call MPI_IRSEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, & send_tag, comm, r8_win%sqest(r8_win%nsend), ierr) endif else if (send_local) then call MPI_SEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, & send_tag, comm, ierr) else call MPI_ISEND(ga_r8_s(r8_win%offset_s+1), qsize, mp_r8, r8_win%dest, & send_tag, comm, r8_win%sqest(r8_win%nsend), ierr) endif endif endif enddo endif ! mod_method r8_win%ncall_s = r8_win%ncall_s + numtrm endif ! sw_local #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif end subroutine mp_sendtrirr !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_recvtrirr --- Finalize communication of contiguous tracer parcels ! ! !INTERFACE: subroutine mp_recvtrirr ( comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq, & 2,2 ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts, & ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr, & modc ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks integer, intent(in) :: mbeg ! initial tracer index integer, intent(in) :: mend ! final tracer index integer, intent(in) :: mq ! total tracer indices integer, intent(in) :: ifirsts ! first I index of source integer, intent(in) :: ilasts ! last I index of source integer, intent(in) :: jfirsts ! first j index of source integer, intent(in) :: jlasts ! last j index of source integer, intent(in) :: kfirsts ! first k index of source integer, intent(in) :: klasts ! last k index of source integer, intent(in) :: ifirstr ! first I index of target integer, intent(in) :: ilastr ! last I index of target integer, intent(in) :: jfirstr ! first j index of target integer, intent(in) :: jlastr ! last j index of target integer, intent(in) :: kfirstr ! first k index of target integer, intent(in) :: klastr ! last k index of target integer, optional, intent(in) :: modc(4) ! 1: classical, swap p2p, swap a2a ! 2: handshake ! 3: send vs isend ! 4: max number of outstanding requests real(r8), intent(in) :: qin(ifirsts:ilasts,jfirsts:jlasts,kfirsts:klasts,1:mq) ! input tracer array ! !OUTPUT PARAMETERS: real(r8), intent(out) :: qout(ifirstr:ilastr,jfirstr:jlastr,kfirstr:klastr,1:mq) ! output tracer array ! ! !DESCRIPTION: ! Complete transfer of a generalized region initiated by {\tt mp\_sendtrirr}. ! Communicate a number of contiguous parcels to/from arbitrary set of PEs. ! Modc(1): if 0, use original approach of posting all communications in mp_sendtrirr and ! placing wait points here; otherwise don't do anything - mp_swaptrirr is called from mp_sendirr. ! Modc(3): if 1, then use blocking send; otherwise use nonblocking send ! ! !REVISION HISTORY: ! 02.08.15 Sawyer Creation ! 02.11.06 Mirin Optimizations ! 03.03.03 Sawyer Now using packed arrays for MPI2 ! 04.02.24 Mirin Various mpi2 options ! 08.09.18 Mirin Major overhaul, to include approaches from Mirin and Worley ! !EOP !------------------------------------------------------------------------------ !BOC integer :: ipe, blocksize, offset_r, mod_method integer unitsize, offset_0 integer Ierr integer InStats(numpro*MPI_STATUS_SIZE) integer OutStats(numpro*MPI_STATUS_SIZE) integer i, j, num_r, num_s, m integer :: offset_v (Max_Nparcels) integer ipe2, ceil2num integer numtr, numtrm integer sw_local, maxreq_local logical hs_local, send_local logical sw_alltoall integer comm_size, comm_pid integer ip, jp, kp, mp, ir, jr, jir, mt if (present(modc)) then sw_local = modc(1) hs_local = (modc(2) .eq. 1) send_local = (modc(3) .eq. 1) maxreq_local = modc(4) else sw_local = 0 hs_local = .true. send_local = .true. maxreq_local = -1 endif ! Do not call mp_swaptrirr (hence return) unless mod_method equals 0 mod_method = recv_bl(1)%method if (mod_method .gt. 0) sw_local = 0 ! Return if swap_irr if (sw_local .gt. 0) return #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_SIZE (comm, comm_size, ierr) call MPI_COMM_RANK (comm, comm_pid, ierr) ceil2num = ceil2(numpro) numtrm = mend - mbeg numtr = numtrm + 1 ! num_s = 0 if this processes is not part of the sending decomposition num_s = size(send_bl) if (send_bl(1)%Nparcels == -1) then num_s = 0 endif ! num_r = 0 if this processes is not part of the receiving decomposition num_r = size(recv_bl) if (recv_bl(1)%Nparcels == -1) then num_r = 0 endif r8_win%ncall_r = r8_win%ncall_r + 1 if (mod_method .gt. 0) then ! mpi derived types if (r8_win%ncall_r .gt. MaxTrf-numtrm) then write(iulog,*) "mp_recvtrirr: derived type handle count exceeded - exiting" write(iulog,*) "r8_win%ncall_r MaxTrf = ", r8_win%ncall_r, MaxTrf call exit(1) endif if (num_s .gt. 0 .and. (.not. send_local)) then do m = mbeg, mend CALL MPI_WAITALL( comm_size, InHandle(:,r8_win%ncall_r+m-mbeg), InStats, Ierr ) enddo endif if (num_r .gt. 0) then do m = mbeg, mend CALL MPI_WAITALL( comm_size, OutHandle(:,r8_win%ncall_r+m-mbeg), OutStats, Ierr ) enddo endif else ! temporary contiguous buffer / global window jr = jlastr - jfirstr + 1 ir = ilastr - ifirstr + 1 jir = jr * ir if (r8_win%ncall_r .gt. max_irr-numtrm) then write(iulog,*) "mp_recvtrirr: insufficient window storage - exiting" write(iulog,*) "r8_win%ncall_r max_irr = ", r8_win%ncall_r, max_irr call exit(1) endif unitsize = r8_win%size/max_irr ! scatter data from global receive buffer to final destination offset_0 = (r8_win%ncall_r-1)*unitsize offset_r = offset_0 do ipe2=1, ceil2num ipe = ieor(ipe2-1,comm_pid) + 1 if (ipe .gt. num_r) cycle r8_win%size_r = numtr*recv_bl(ipe)%Tot_Size if (r8_win%size_r .ne. 0) then r8_win%offset_r = offset_r offset_r = offset_r + r8_win%size_r if (offset_r-offset_0 .gt. numtr*unitsize) then write(iulog,*) "Fatal mp_recvtrirr: receive window out of space - exiting" write(iulog,*) 'comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif r8_win%nread = r8_win%nread + 1 call MPI_WAIT(r8_win%rqest(r8_win%nread), Status, ierr) offset_v(1) = r8_win%offset_r do j = 2, recv_bl(ipe)%Nparcels offset_v(j) = offset_v(j-1) + recv_bl(ipe)%blocksizes(j-1) enddo do j = 1, recv_bl(ipe)%Nparcels do m = mbeg, mend do i = 1, recv_bl(ipe)%blocksizes(j) mp = recv_bl(ipe)%displacements(j)+i kp = kfirstr + (mp-1)/jir mt = (kp-kfirstr)*jir jp = jfirstr + (mp-mt-1)/ir ip = mp-mt - (jp-jfirstr)*ir + ifirstr-1 qout(ip,jp,kp,m) = ga_r8_r(recv_bl(ipe)%Tot_Size*(m-mbeg)+offset_v(j)+i) enddo enddo enddo endif enddo if ((r8_win%ncall_s == r8_win%ncall_r + numtrm) .and. (.not. send_local)) then call MPI_WAITALL(r8_win%nsend, r8_win%sqest, Stats, ierror) endif endif ! mod_method .gt. 0 r8_win%ncall_r = r8_win%ncall_r + numtrm if (r8_win%ncall_s == r8_win%ncall_r) then r8_win%nsend = 0 r8_win%nrecv = 0 r8_win%nread = 0 r8_win%ncall_s = 0 r8_win%ncall_r = 0 endif #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_recvtrirr !------------------------------------------------------------------------------ ! !------------------------------------------------------------------------------ !BOP ! !ROUTINE: mp_swaptrirr --- Write r8 contiguous parcels to global array ! using XOR swap ordering - for multiple tracers ! ! !INTERFACE: subroutine mp_swaptrirr ( comm, send_bl, recv_bl, qin, qout, mbeg, mend, mq, & 2,4 ifirsts, ilasts, jfirsts, jlasts, kfirsts, klasts, & ifirstr, ilastr, jfirstr, jlastr, kfirstr, klastr, & sw_handshake, sw_maxreq, sw_alltoall, sw_send ) ! !INPUT PARAMETERS: integer, intent(in) :: comm ! communicator type(blockdescriptor), intent(in) :: send_bl(:) ! send blocks type(blockdescriptor), intent(in) :: recv_bl(:) ! receive blocks integer, intent(in) :: mbeg ! initial tracer index integer, intent(in) :: mend ! final tracer index integer, intent(in) :: mq ! total tracer indices integer, intent(in) :: ifirsts ! first I index of source integer, intent(in) :: ilasts ! last I index of source integer, intent(in) :: jfirsts ! first j index of source integer, intent(in) :: jlasts ! last j index of source integer, intent(in) :: kfirsts ! first k index of source integer, intent(in) :: klasts ! last k index of source integer, intent(in) :: ifirstr ! first I index of target integer, intent(in) :: ilastr ! last I index of target integer, intent(in) :: jfirstr ! first j index of target integer, intent(in) :: jlastr ! last j index of target integer, intent(in) :: kfirstr ! first k index of target integer, intent(in) :: klastr ! last k index of target logical, optional, intent(in) :: sw_handshake ! use flow control and ! ready send integer, optional, intent(in) :: sw_maxreq ! maximum number of outstanding ! MPI requests logical, optional, intent(in) :: sw_alltoall ! use mpi_alltoall logical, optional, intent(in) :: sw_send ! use mpi_send instead of isend real(r8), intent(in) :: qin(ifirsts:ilasts,jfirsts:jlasts,kfirsts:klasts,1:mq) ! input tracer array ! !OUTPUT PARAMETERS: real(r8), intent(out) :: qout(ifirstr:ilastr,jfirstr:jlastr,kfirstr:klastr,1:mq) ! output tracer array ! ! !DESCRIPTION: ! ! XOR-ordered version of all-to-all communication ! ! WARNING: mod_comm parameter max_irr might need to be set larger than expected ! when swapping multiple variables; specifically, max_irr must be at least ! as large as the incoming r8_win%ncall_s + the number of variables to ! be swapped ! ! !REVISION HISTORY: ! 08.06.30 Worley original: derived from mp_sendirr, but using ! swapm logic and XOR swap order ! 08.08.22 Worley removed swapm; reimplemented with native MPI, ! added flow control/ready send option and maxreq ! throttling, added alltoall option ! ! !BUGS: ! !EOP !------------------------------------------------------------------------------ !BOC ! ! !LOCAL VARIABLES: integer :: i, j, p, istep, num_s, num_r integer :: comm_pid, comm_size, steps, ierr integer :: ipe, offset_s, offset_r, offset_0, unitsize integer :: sndlths(0:numpro-1), sdispls(0:numpro-1) integer :: rcvlths(0:numpro-1), rdispls(0:numpro-1) integer :: swapids(numpro) integer :: sndids(numpro) ! nonblocking MPI send request ids integer :: rcvids(numpro) ! nonblocking MPI recv request ids integer :: InStats(numpro*MPI_STATUS_SIZE) integer :: OutStats(numpro*MPI_STATUS_SIZE) integer :: offset_v integer :: rstep integer :: maxreq, maxreqh, hs logical :: handshake, alltoall, sendd integer :: ip, jp, kp, mp, irs, jrs, jirs, mt integer :: numtr, numtrm, irr, jrr, jirr, m #if defined( MODCM_TIMING ) call t_startf('mod_comm communication') #endif call MPI_COMM_SIZE (comm, comm_size, ierr) call MPI_COMM_RANK (comm, comm_pid, ierr) ! num_s = 0 if this process is not part of the sending decomposition num_s = size(send_bl) if (send_bl(1)%Nparcels == -1) then num_s = 0 endif ! num_r = 0 if this process is not part of the receiving decomposition num_r = size(recv_bl) if (recv_bl(1)%Nparcels == -1) then num_r = 0 endif if ( present(sw_handshake) ) then handshake = sw_handshake else handshake = .false. endif if ( present(sw_alltoall) ) then alltoall = sw_alltoall else alltoall = .false. endif if ( present(sw_send) ) then sendd = sw_send else sendd = .false. endif numtrm = mend - mbeg numtr = numtrm + 1 jrs = jlasts - jfirsts + 1 irs = ilasts - ifirsts + 1 jirs = jrs * irs jrr = jlastr - jfirstr + 1 irr = ilastr - ifirstr + 1 jirr = jrr * irr unitsize = r8_win%size/max_irr ! advance to unused portion of storage window r8_win%ncall_s = r8_win%ncall_s + 1 if (r8_win%ncall_s .gt. max_irr-numtrm) then write(iulog,*) "mp_swaptrirr: insufficient window storage - exiting" write(iulog,*) "r8_win%ncall_s max_irr = ", r8_win%ncall_s, max_irr call exit(1) endif ! calculate send lengths and displacements offset_0 = (r8_win%ncall_s-1)*unitsize offset_s = offset_0 sndlths(:) = 0 sdispls(:) = 0 do ipe=1, num_s sndlths(ipe-1) = numtr*send_bl(ipe)%Tot_Size sdispls(ipe-1) = offset_s if (sndlths(ipe-1) .ne. 0) then offset_s = offset_s + sndlths(ipe-1) if (offset_s-offset_0 .gt. numtr*unitsize) then write(iulog,*) "Fatal mp_swaptrirr: send window out of space - exiting" write(iulog,*) '1 comm_pid ipe unitsize offset_s offset_0 = ', comm_pid, & ipe, unitsize, offset_s, offset_0 call exit(1) endif endif enddo ! calculate receive lengths and displacements offset_r = offset_0 rcvlths(:) = 0 rdispls(:) = 0 do ipe=1, num_r rcvlths(ipe-1) = numtr*recv_bl(ipe)%Tot_Size rdispls(ipe-1) = offset_r if (rcvlths(ipe-1) .ne. 0) then offset_r = offset_r + rcvlths(ipe-1) if (numtr*unitsize < offset_r-offset_0) then write(iulog,*) "Fatal mp_swaptrirr: receive window out of space - exiting" write(iulog,*) '1 comm_pid ipe unitsize offset_r offset_0 = ', comm_pid, & ipe, unitsize, offset_r, offset_0 call exit(1) endif endif enddo ! Calculate swap partners and number of steps in point-to-point ! implementations of alltoall algorithm. steps = 0 do ipe=1,ceil2(comm_size)-1 p = pair(comm_size,ipe,comm_pid) if (p >= 0) then if (sndlths(p) > 0 .or. rcvlths(p) > 0) then steps = steps + 1 swapids(steps) = p end if end if end do if (.not. alltoall) then sndids(1:steps) = MPI_REQUEST_NULL rcvids(1:steps) = MPI_REQUEST_NULL if (steps .eq. 0) then maxreq = 0 maxreqh = 0 elseif (steps .eq. 1) then maxreq = 1 maxreqh = 1 else if ( present(sw_maxreq) ) then if ((sw_maxreq .le. steps) .and. (sw_maxreq .ge. 0)) then maxreq = sw_maxreq if (maxreq > 1) then maxreqh = maxreq/2 else maxreq = 2 maxreqh = 1 endif else maxreq = steps maxreqh = steps endif else maxreq = steps maxreqh = steps endif endif ! Post receive requests do istep=1,maxreq p = swapids(istep) offset_r = rdispls(p)+1 if (rcvlths(p) > 0) then call mpi_irecv ( ga_r8_r(offset_r), rcvlths(p), mp_r8, & p, p, comm, rcvids(istep), ierr ) if (handshake) then call mpi_send( hs, 1, mp_i4, p, p, comm, & ierr ) endif endif enddo rstep = maxreq ! endif ! gather data into global send buffer do istep=1,steps p = swapids(istep) if (sndlths(p) .ne. 0) then offset_v = sdispls(p) do j = 1, send_bl(p+1)%nparcels do m = mbeg, mend do i = 1, send_bl(p+1)%blocksizes(j) mp = send_bl(p+1)%displacements(j)+i kp = kfirsts + (mp-1)/jirs mt = (kp-kfirsts)*jirs jp = jfirsts + (mp-mt-1)/irs ip = mp-mt - (jp-jfirsts)*irs + ifirsts-1 ga_r8_s(send_bl(p+1)%Tot_Size*(m-mbeg)+offset_v+i) = qin(ip,jp,kp,m) enddo enddo offset_v = offset_v + send_bl(p+1)%blocksizes(j) enddo endif if (.not. alltoall) then ! Submit new i(r)send request offset_s = sdispls(p)+1 if (sndlths(p) > 0) then if (handshake) then call mpi_recv ( hs, 1, mp_i4, p, comm_pid, comm, & OutStats, ierr ) if (sendd) then call mpi_rsend( ga_r8_s(offset_s), sndlths(p), mp_r8, & p, comm_pid, comm, ierr ) else call mpi_irsend( ga_r8_s(offset_s), sndlths(p), mp_r8, & p, comm_pid, comm, sndids(istep), ierr ) endif else if (sendd) then call mpi_send ( ga_r8_s(offset_s), sndlths(p), mp_r8, & p, comm_pid, comm, ierr ) else call mpi_isend ( ga_r8_s(offset_s), sndlths(p), mp_r8, & p, comm_pid, comm, sndids(istep), ierr ) endif endif endif if (istep > maxreqh) then ! Wait for oldest irecv request to complete call mpi_wait( rcvids(istep-maxreqh), OutStats, ierr ) ! Submit a new irecv request if (rstep < steps) then rstep = rstep + 1 p = swapids(rstep) offset_r = rdispls(p)+1 if (rcvlths(p) > 0) then call mpi_irecv( ga_r8_r(offset_r), rcvlths(p), mp_r8, & p, p, comm, rcvids(rstep), ierr ) if (handshake) then call mpi_send ( hs, 1, mp_i4, p, p, comm, & ierr ) endif endif endif ! Wait for outstanding i(r)send request to complete if (.not. sendd) then call mpi_wait( sndids(istep-maxreqh), InStats, ierr ) endif endif ! endif ! enddo ! local copy to send buffer if (sndlths(comm_pid) .ne. 0) then offset_v = sdispls(comm_pid) do j = 1, send_bl(comm_pid+1)%nparcels do m = mbeg, mend do i = 1, send_bl(comm_pid+1)%blocksizes(j) mp = send_bl(comm_pid+1)%displacements(j)+i kp = kfirsts + (mp-1)/jirs mt = (kp-kfirsts)*jirs jp = jfirsts + (mp-mt-1)/irs ip = mp-mt - (jp-jfirsts)*irs + ifirsts-1 ga_r8_s(send_bl(comm_pid+1)%Tot_Size*(m-mbeg)+offset_v+i) = qin(ip,jp,kp,m) enddo enddo offset_v = offset_v + send_bl(comm_pid+1)%blocksizes(j) enddo if (.not. alltoall) then ga_r8_r(rdispls(comm_pid)+1:rdispls(comm_pid)+rcvlths(comm_pid)) = & ga_r8_s(sdispls(comm_pid)+1:sdispls(comm_pid)+sndlths(comm_pid)) endif endif if (alltoall) then call mpi_alltoallv (ga_r8_s, sndlths, sdispls, mp_r8, & ga_r8_r, rcvlths, rdispls, mp_r8, & comm, ierror) endif ! local copy from receive buffer if (rcvlths(comm_pid) .ne. 0) then offset_v = rdispls(comm_pid) do j = 1, recv_bl(comm_pid+1)%Nparcels do m = mbeg, mend do i = 1, recv_bl(comm_pid+1)%blocksizes(j) mp = recv_bl(comm_pid+1)%displacements(j)+i kp = kfirstr + (mp-1)/jirr mt = (kp-kfirstr)*jirr jp = jfirstr + (mp-mt-1)/irr ip = mp-mt - (jp-jfirstr)*irr + ifirstr-1 qout(ip,jp,kp,m) = ga_r8_r(recv_bl(comm_pid+1)%Tot_Size*(m-mbeg)+offset_v+i) enddo enddo offset_v = offset_v + recv_bl(comm_pid+1)%blocksizes(j) enddo endif ! scatter data from global receive buffer to final destination do istep=1,steps p = swapids(istep) if (.not. alltoall) then if (istep > steps-maxreqh) then call mpi_wait( rcvids(istep), OutStats, ierr ) endif endif if (rcvlths(p) .ne. 0) then offset_v = rdispls(p) do j = 1, recv_bl(p+1)%Nparcels do m = mbeg, mend do i = 1, recv_bl(p+1)%blocksizes(j) mp = recv_bl(p+1)%displacements(j)+i kp = kfirstr + (mp-1)/jirr mt = (kp-kfirstr)*jirr jp = jfirstr + (mp-mt-1)/irr ip = mp-mt - (jp-jfirstr)*irr + ifirstr-1 qout(ip,jp,kp,m) = ga_r8_r(recv_bl(p+1)%Tot_Size*(m-mbeg)+offset_v+i) enddo enddo offset_v = offset_v + recv_bl(p+1)%blocksizes(j) enddo endif enddo ! Wait for any outstanding send requests to complete. if (.not. alltoall .and. .not. sendd) then call mpi_waitall( maxreqh, sndids(steps-maxreqh+1), InStats, ierr ) endif ! clean-up ! make used portion of storage window available for reuse r8_win%ncall_s = r8_win%ncall_s - 1 #if defined( MODCM_TIMING ) call t_stopf('mod_comm communication') #endif !EOC end subroutine mp_swaptrirr # endif !------------------------------------------------------------------------------ #endif end module mod_comm