module phys_grid

!-----------------------------------------------------------------------
!
! Purpose: Definition of physics computational horizontal grid.
!
! Method: Variables are private; interface routines used to extract
!         information for use in user code.
!
! Entry points:
!      phys_grid_init        initialize chunk'ed data structure
!      phys_grid_initialized get physgrid_set flag
!
!      phys_grid_defaultopts get default runtime options
!      phys_grid_setopts     set runtime options
!
!      get_chunk_indices_p   get local chunk index range
!      get_ncols_p           get number of columns for a given chunk
!      get_xxx_all_p         get global indices, coordinates, or values
!                            for a given chunk
!      get_xxx_vec_p         get global indices, coordinates, or values
!                            for a subset of the columns in a chunk
!      get_xxx_p             get global indices, coordinates, or values
!                            for a single column
!      where xxx is
!       area                 for column surface area (in radians squared)
!       gcol                 for global column index
!       lat                  for global latitude index
!       lon                  for global longitude index
!       rlat                 for latitude coordinate (in radians)
!       rlon                 for longitude coordinate (in radians)
!       wght                 for column integration weight
!
!      scatter_field_to_chunk
!                            distribute field
!                            to decomposed chunk data structure
!      gather_chunk_to_field
!                            reconstruct field
!                            from decomposed chunk data structure
!
!      read_chunk_from_field
!                            read and distribute field
!                            to decomposed chunk data structure
!      write_field_from_chunk
!                            write field
!                            from decomposed chunk data structure
!
!      block_to_chunk_send_pters
!                            return pointers into send buffer where data
!                            from decomposed fields should be copied to
!      block_to_chunk_recv_pters
!                            return pointers into receive buffer where data
!                            for decomposed chunk data structures should
!                            be copied from
!      transpose_block_to_chunk
!                            transpose buffer containing decomposed
!                            fields to buffer containing decomposed
!                            chunk data structures
!
!      chunk_to_block_send_pters
!                            return pointers into send buffer where data
!                            from decomposed chunk data structures should
!                            be copied to
!      chunk_to_block_recv_pters
!                            return pointers into receive buffer where data
!                            for decomposed fields should be copied from
!      transpose_chunk_to_block
!                            transpose buffer containing decomposed
!                            chunk data structures to buffer containing
!                            decomposed fields
!
!      chunk_index           identify whether index is for a latitude or
!                            a chunk
!
! FOLLOWING ARE NO LONGER USED, AND ARE CURRENTLY COMMENTED OUT
!      get_gcol_owner_p      get owner of column
!                            for given global physics column index
!
!      buff_to_chunk         Copy from local buffer to local chunk data
!                            structure. (Needed for cpl6.)
!
!      chunk_to_buff         Copy from local chunk data structure to
!                            local buffer. (Needed for cpl6.)
!
! Author: Patrick Worley and John Drake
!
!-----------------------------------------------------------------------
   use shr_kind_mod, only: r8 => shr_kind_r8, r4 => shr_kind_r4
   use physconst,    only: pi
   use ppgrid,       only: pcols, pver, begchunk, endchunk
#if ( defined SPMD )
   use spmd_dyn,     only: proc, block_buf_nrecs, chunk_buf_nrecs, &
                           local_dp_map
   use mpishorthand
#endif
   use spmd_utils,   only: iam, masterproc, npes, proc_smp_map, nsmps
   use m_MergeSorts, only: IndexSet, IndexSort
   use abortutils,   only: endrun
   use perf_mod
   use cam_logfile,  only: iulog

   implicit none
   save

#if ( ! defined SPMD )
   integer, private :: block_buf_nrecs
   integer, private :: chunk_buf_nrecs
   logical, private :: local_dp_map=.true.
#endif

! dynamics field grid information
   integer, private :: hdim1_d, hdim2_d
                                       ! dimensions of rectangular horizontal grid
                                       ! data structure. If 1D data structure, then
                                       ! hdim2_d == 1.
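! The accessor API listed above is typically driven from physics code
! that loops over the local chunks. The following is a hedged,
! illustrative sketch only (not part of this module); `rlats` and `ncol`
! are hypothetical locals, with rlats dimensioned (pcols):
!
!    do lcid = begchunk, endchunk
!       ncol = get_ncols_p(lcid)              ! columns in this chunk
!       call get_rlat_all_p(lcid, pcols, rlats)
!       do i = 1, ncol
!          ! ... operate on column i using rlats(i) ...
!       end do
!    end do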
! physics field data structures
   integer         :: ngcols           ! global column count in physics grid (all)
   integer, public :: ngcols_p         ! global column count in physics grid
                                       ! (without holes)

   integer, dimension(:), allocatable, private :: dyn_to_latlon_gcol_map
                                       ! map from unsorted (dynamics) to lat/lon sorted grid indices
   integer, dimension(:), allocatable, private :: latlon_to_dyn_gcol_map
                                       ! map from lat/lon sorted grid to unsorted (dynamics) indices
   integer, dimension(:), allocatable, private :: lonlat_to_dyn_gcol_map
                                       ! map from lon/lat sorted grid to unsorted (dynamics) indices

!  integer, private :: clat_p_tot      ! number of unique latitudes
!  integer, private :: clon_p_tot      ! number of unique longitudes
! these are public to support mozart chemistry in the short term
   integer, public :: clat_p_tot       ! number of unique latitudes
   integer, public :: clon_p_tot       ! number of unique longitudes

   integer, dimension(:), allocatable, private :: clat_p_cnt
                                       ! number of repeats for each latitude
   integer, dimension(:), allocatable, private :: clat_p_idx
                                       ! index in latlon ordering for first occurrence
                                       ! of latitude corresponding to given
                                       ! latitude index
   real(r8), dimension(:), allocatable :: clat_p
                                       ! unique latitudes (radians, increasing)

   integer, dimension(:), allocatable, private :: clon_p_cnt
                                       ! number of repeats for each longitude
   real(r8), dimension(:), allocatable :: clon_p
                                       ! unique longitudes (radians, increasing)

   integer, dimension(:), allocatable, private :: lat_p
                                       ! index into list of unique column latitudes
   integer, dimension(:), allocatable, private :: lon_p
                                       ! index into list of unique column longitudes

! chunk data structures
   type chunk
      integer :: ncols                 ! number of vertical columns
      integer :: gcol(pcols)           ! global physics column indices
      integer :: lon(pcols)            ! global longitude indices
      integer :: lat(pcols)            ! global latitude indices
      integer :: owner                 ! id of process where chunk assigned
      integer :: lcid                  ! local chunk index
   end type chunk

   integer :: nchunks                  ! global chunk count
   type (chunk), dimension(:), allocatable, public :: chunks
                                       ! global computational grid

   integer, dimension(:), allocatable, private :: npchunks
                                       ! number of chunks assigned to each process

   type lchunk
      integer  :: ncols                ! number of vertical columns
      integer  :: cid                  ! global chunk index
      integer  :: gcol(pcols)          ! global physics column indices
      real(r8) :: area(pcols)          ! column surface area (from dynamics)
      real(r8) :: wght(pcols)          ! column integration weight (from dynamics)
   end type lchunk

   integer, private :: nlchunks        ! local chunk count
   type (lchunk), dimension(:), allocatable, private :: lchunks
                                       ! local chunks

   type knuhc
      integer :: chunkid               ! chunk id
      integer :: col                   ! column index in chunk
   end type knuhc
   type (knuhc), dimension(:), allocatable, private :: knuhcs
                                       ! map from global column indices
                                       ! to chunk'ed grid

! column mapping data structures
   type column_map
      integer :: chunk                 ! global chunk index
      integer :: ccol                  ! column ordering in chunk
   end type column_map

   integer, private :: nlcols          ! local column count
   type (column_map), dimension(:), allocatable, private :: pgcols
                                       ! ordered list of columns (for use in gather/scatter)
                                       ! NOTE: consistent with local ordering
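! Relationship among these structures, as a hedged illustrative sketch
! (commented out; `g` is a hypothetical global physics column index):
!
!    cid = knuhcs(g)%chunkid           ! chunk holding column g
!    i   = knuhcs(g)%col               ! its slot within that chunk
!    ! round trip: chunks(cid)%gcol(i) recovers g.
!    ! pgcols lists all (chunk, column) pairs sorted by owning process,
!    ! so gather/scatter can walk columns in process-contiguous order.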
! column remap data structures
   integer, dimension(:), allocatable, private :: gs_col_num
                                       ! number of columns scattered to each process in
                                       ! field_to_chunk scatter
   integer, dimension(:), allocatable, private :: gs_col_offset
                                       ! offset of columns (-1) in pgcols scattered to
                                       ! each process in field_to_chunk scatter

   integer, dimension(:), allocatable, private :: btofc_blk_num
                                       ! number of grid points scattered to each process in
                                       ! block_to_chunk alltoallv, and gathered from each
                                       ! process in chunk_to_block alltoallv
   integer, dimension(:), allocatable, private :: btofc_chk_num
                                       ! number of grid points gathered from each process in
                                       ! block_to_chunk alltoallv, and scattered to each
                                       ! process in chunk_to_block alltoallv

   type btofc_pters
      integer :: ncols                 ! number of columns in block
      integer :: nlvls                 ! number of levels in columns
      integer, dimension(:,:), pointer :: pter
   end type btofc_pters
   type (btofc_pters), dimension(:), allocatable, private :: btofc_blk_offset
                                       ! offset in btoc send array (-1) where
                                       ! (blockid, bcid, k) column should be packed in
                                       ! block_to_chunk alltoallv, AND
                                       ! offset in ctob receive array (-1) from which
                                       ! (blockid, bcid, k) column should be unpacked in
                                       ! chunk_to_block alltoallv
   type (btofc_pters), dimension(:), allocatable, private :: btofc_chk_offset
                                       ! offset in btoc receive array (-1) from which
                                       ! (lcid, i, k) data should be unpacked in
                                       ! block_to_chunk alltoallv, AND
                                       ! offset in ctob send array (-1) where
                                       ! (lcid, i, k) data should be packed in
                                       ! chunk_to_block alltoallv

! miscellaneous phys_grid data
   integer, private :: dp_coup_steps   ! number of swaps in transpose algorithm
   integer, dimension(:), private, allocatable :: dp_coup_proc
                                       ! swap partner in each step of
                                       ! transpose algorithm

   logical :: physgrid_set = .false.   ! flag indicates physics grid has been set

   integer, private :: max_nproc_smpx  ! maximum number of processes assigned to a
                                       ! single virtual SMP used to define physics
                                       ! load balancing
   integer, private :: nproc_busy_d    ! number of processes active during the dynamics
                                       ! (assigned a dynamics block)

! Physics grid decomposition options:
!  -1: each chunk is a dynamics block
!   0: chunk definitions and assignments do not require interprocess comm.
!   1: chunk definitions and assignments do not require internode comm.
!   2: chunk definitions and assignments may require communication between all processes
!   3: chunk definitions and assignments only require communication with one other process
!   4: concatenated blocks, no load balancing, no interprocess communication
!   5: split individual blocks into chunks, assigning columns using block ordering
   integer, private, parameter :: min_lbal_opt = -1
   integer, private, parameter :: max_lbal_opt = 5
   integer, private, parameter :: def_lbal_opt = 0          ! default
   integer, private :: lbal_opt = def_lbal_opt

! Physics grid load balancing options:
!   0: assign columns to chunks as single columns, wrap mapped across chunks
!   1: use (day/night; north/south) twin algorithm to determine load-balanced pairs of
!      columns and assign columns to chunks in pairs, wrap mapped
   integer, private, parameter :: min_twin_alg = 0
   integer, private, parameter :: max_twin_alg = 1
   integer, private, parameter :: def_twin_alg_lonlat = 1   ! default
   integer, private, parameter :: def_twin_alg_unstructured = 0
   integer, private :: twin_alg = def_twin_alg_lonlat

! target number of chunks per thread
   integer, private, parameter :: min_chunks_per_thread = 1
   integer, private, parameter :: def_chunks_per_thread = &
                                  min_chunks_per_thread    ! default
   integer, private :: chunks_per_thread = def_chunks_per_thread
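! These options are normally set through phys_grid_setopts (defined
! below). A hedged, illustrative call sequence (not part of the module):
!
!    call phys_grid_setopts(phys_loadbalance_in    = 2, &  ! full load balance
!                           phys_twin_algorithm_in = 1, &  ! day/night twins
!                           phys_chnk_per_thd_in   = 1)
!
! Out-of-range values abort via endrun, so callers should stay within
! the min/max parameters declared above.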
! Dynamics/physics transpose method for nonlocal load-balance:
!  -1: use "0" if max_nproc_smpx and nproc_busy_d are both > npes/2; otherwise use "1"
!   0: use mpi_alltoallv
!   1: use point-to-point MPI-1 two-sided implementation
!   2: use point-to-point MPI-2 one-sided implementation if supported,
!      otherwise use MPI-1 implementation
!   3: use Co-Array Fortran implementation if supported,
!      otherwise use MPI-1 implementation
!  11-13: use mod_comm, choosing any of several methods internal to mod_comm.
!      The method within mod_comm (denoted mod_method) has possible values 0,1,2 and
!      is set according to mod_method = phys_alltoall - modmin_alltoall, where
!      modmin_alltoall is 11.
   integer, private, parameter :: min_alltoall = -1
   integer, private, parameter :: max_alltoall = 3
# if defined(MODCM_DP_TRANSPOSE)
   integer, private, parameter :: modmin_alltoall = 11
   integer, private, parameter :: modmax_alltoall = 13
# endif
   integer, private, parameter :: def_alltoall = -1         ! default
   integer, private :: phys_alltoall = def_alltoall

contains
!========================================================================

   integer function get_nlcols_p()
      get_nlcols_p = nlcols
   end function get_nlcols_p

   integer function get_clon_p_tot()
      get_clon_p_tot = clon_p_tot
   end function get_clon_p_tot

   integer function get_clat_p_tot()
      get_clat_p_tot = clat_p_tot
   end function get_clat_p_tot

!========================================================================
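! phys_grid_init (below) must run after the dynamics grid is defined and
! before any of the accessors are used. A hedged sketch of the guard a
! caller might use (illustrative only):
!
!    if (.not. phys_grid_initialized()) then
!       call phys_grid_init()
!    end if
!    nlcols = get_nlcols_p()   ! now safe to query the decomposition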
   subroutine phys_grid_init( )
!-----------------------------------------------------------------------
!
! Purpose: Physics mapping initialization routine:
!
! Method:
!
! Author: John Drake and Patrick Worley
!
!-----------------------------------------------------------------------
      use pmgrid, only: plev
      use dyn_grid, only: get_block_bounds_d, &
                          get_block_gcol_d, get_block_gcol_cnt_d, &
                          get_block_levels_d, get_block_lvl_cnt_d, &
                          get_block_owner_d, &
                          get_gcol_block_d, get_gcol_block_cnt_d, &
                          get_horiz_grid_dim_d, get_horiz_grid_d
      use spmd_utils, only: pair, ceil2
!
!------------------------------Arguments--------------------------------
!
!
!---------------------------Local workspace-----------------------------
!
      integer :: i, j, jb, k, p          ! loop indices
      integer :: pre_i                   ! earlier index in loop iteration
      integer :: clat_p_dex, clon_p_dex  ! indices into unique lat. and lon. arrays
      integer :: maxblksiz               ! maximum number of columns in a dynamics block
      integer :: beg_dex, end_dex        ! index range
      integer :: cid, lcid               ! global and local chunk ids
      integer :: max_ncols               ! upper bound on number of columns in a block
      integer :: ncols                   ! number of columns in current chunk
      integer :: curgcol, curgcol_d      ! current global column index
      integer :: firstblock, lastblock   ! global block indices
      integer :: blksiz                  ! current block size
      integer :: glbcnt, curcnt          ! running grid point counts
      integer :: curp                    ! current process id
      integer :: block_cnt               ! number of blocks containing data
                                         ! for a given vertical column
      integer :: numlvl                  ! number of vertical levels in block
                                         ! column
      integer :: levels(plev+1)          ! vertical level indices
      integer :: owner_d                 ! process owning given block column
      integer :: owner_p                 ! process owning given chunk column
      integer :: blockids(plev+1)        ! block indices
      integer :: bcids(plev+1)           ! block column indices
      integer :: glon, glat              ! global (lon,lat) indices
      integer :: ntmp1, ntmp2            ! work variables
      logical :: clon_wrap               ! flag used in initializing lat_p, lon_p

      ! column surface area (from dynamics)
      real(r8), dimension(:), allocatable :: area_d

      ! column integration weight (from dynamics)
      real(r8), dimension(:), allocatable :: wght_d

      ! chunk global ordering
      integer, dimension(:), allocatable :: pchunkid

      ! permutation array used in physics column sorting;
      ! reused later as work space in (lbal_opt == -1) logic
      integer, dimension(:), allocatable :: cdex

      ! latitudes and longitudes and column area for dynamics columns
      real(r8), dimension(:), allocatable :: clat_d
      real(r8), dimension(:), allocatable :: clon_d
      real(r8) :: clat_p_tmp
      real(r8) :: clon_p_tmp

      integer lons(2), lats(2)

      call print_memusage('phys_grid_init: start')
      call t_adj_detailf(-2)
      call t_startf("phys_grid_init")

!-----------------------------------------------------------------------
!
! Initialize physics grid, using dynamics grid
! a) column coordinates
      call get_horiz_grid_dim_d(hdim1_d,hdim2_d)
      ngcols = hdim1_d*hdim2_d
      allocate( clat_d(1:ngcols) )
      allocate( clon_d(1:ngcols) )
      allocate( cdex(1:ngcols) )
      call print_memusage('phys_grid_init: after allocate 1')

      clat_d = 100000.0_r8
      clon_d = 100000.0_r8
      call get_horiz_grid_d(ngcols, clat_d_out=clat_d, clon_d_out=clon_d)

! count number of "real" column indices
      ngcols_p = 0
      do i=1,ngcols
         if (clon_d(i) < 100000.0_r8) then
            ngcols_p = ngcols_p + 1
         endif
      enddo

! sort over longitude and identify unique longitude coordinates
      call IndexSet(ngcols,cdex)
      call IndexSort(ngcols,cdex,clon_d,descend=.false.)
      clon_p_tmp = clon_d(cdex(1))
      clon_p_tot = 1

      do i=2,ngcols_p
         if (clon_d(cdex(i)) > clon_p_tmp) then
            clon_p_tot = clon_p_tot + 1
            clon_p_tmp = clon_d(cdex(i))
         endif
      enddo

      allocate( clon_p(1:clon_p_tot) )
      allocate( clon_p_cnt(1:clon_p_tot) )
      call print_memusage('phys_grid_init: after allocate 2')

      pre_i = 1
      clon_p_tot = 1
      clon_p(1) = clon_d(cdex(1))
      do i=2,ngcols_p
         if (clon_d(cdex(i)) > clon_p(clon_p_tot)) then
            clon_p_cnt(clon_p_tot) = i-pre_i
            pre_i = i
            clon_p_tot = clon_p_tot + 1
            clon_p(clon_p_tot) = clon_d(cdex(i))
         endif
      enddo
      clon_p_cnt(clon_p_tot) = (ngcols_p+1)-pre_i

! sort over latitude and identify unique latitude coordinates
      call IndexSet(ngcols,cdex)
      call IndexSort(ngcols,cdex,clat_d,descend=.false.)
      clat_p_tmp = clat_d(cdex(1))
      clat_p_tot = 1

      do i=2,ngcols_p
         if (clat_d(cdex(i)) > clat_p_tmp) then
            clat_p_tot = clat_p_tot + 1
            clat_p_tmp = clat_d(cdex(i))
         endif
      enddo

      allocate( clat_p(1:clat_p_tot) )
      allocate( clat_p_cnt(1:clat_p_tot) )
      allocate( clat_p_idx(1:clat_p_tot) )
      call print_memusage('phys_grid_init: after allocate 3')

      pre_i = 1
      clat_p_tot = 1
      clat_p(1) = clat_d(cdex(1))
      do i=2,ngcols_p
         if (clat_d(cdex(i)) > clat_p(clat_p_tot)) then
            clat_p_cnt(clat_p_tot) = i-pre_i
            pre_i = i
            clat_p_tot = clat_p_tot + 1
            clat_p(clat_p_tot) = clat_d(cdex(i))
         endif
      enddo
      clat_p_cnt(clat_p_tot) = (ngcols_p+1)-pre_i

      clat_p_idx(1) = 1
      do j=2,clat_p_tot
         clat_p_idx(j) = clat_p_idx(j-1) + clat_p_cnt(j-1)
      enddo

! sort by longitude within latitudes
      end_dex = 0
      do j=1,clat_p_tot
         beg_dex = end_dex + 1
         end_dex = end_dex + clat_p_cnt(j)
         call IndexSort(cdex(beg_dex:end_dex),clon_d,descend=.false.)
      enddo

! Early clean-up, to minimize memory high water mark
! (not executing find_partner or find_twin)
      if (((twin_alg .ne. 1) .and. (lbal_opt .ne. 3)) .or. &
          (lbal_opt .eq. -1)) deallocate( clat_p_cnt )
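! The IndexSet/IndexSort pairing above implements an indirect sort: cdex
! is initialized to the identity permutation and then permuted so that
! the key array is nondecreasing when traversed through cdex, leaving
! the key array itself untouched. A hedged sketch of the same idea
! (illustrative; `key`, `idx`, and `n` are hypothetical locals):
!
!    call IndexSet(n, idx)                         ! idx = (/1,2,...,n/)
!    call IndexSort(n, idx, key, descend=.false.)  ! key(idx(1)) <= key(idx(2)) <= ...
!    ! key(idx(i)) now scans values in ascending order without moving key.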
! save "longitude within latitude" column ordering
! and determine mapping from unsorted global column index to
! unique latitude/longitude indices
      allocate( lat_p(1:ngcols) )
      allocate( lon_p(1:ngcols) )
      allocate( dyn_to_latlon_gcol_map(1:ngcols) )
      if (lbal_opt .ne. -1) allocate( latlon_to_dyn_gcol_map(1:ngcols_p) )
      call print_memusage('phys_grid_init: after allocate 3')

      clat_p_dex = 1
      lat_p = -1
      dyn_to_latlon_gcol_map = -1
      do i=1,ngcols_p
         if (lbal_opt .ne. -1) latlon_to_dyn_gcol_map(i) = cdex(i)
         dyn_to_latlon_gcol_map(cdex(i)) = i
         do while ((clat_p(clat_p_dex) < clat_d(cdex(i))) .and. &
                   (clat_p_dex < clat_p_tot))
            clat_p_dex = clat_p_dex + 1
         enddo
         lat_p(cdex(i)) = clat_p_dex
      enddo

! sort by latitude within longitudes
      call IndexSet(ngcols,cdex)
      call IndexSort(ngcols,cdex,clon_d,descend=.false.)
      end_dex = 0
      do i=1,clon_p_tot
         beg_dex = end_dex + 1
         end_dex = end_dex + clon_p_cnt(i)
         call IndexSort(cdex(beg_dex:end_dex),clat_d,descend=.false.)
      enddo

! Early clean-up, to minimize memory high water mark
! (not executing find_twin)
      if ((twin_alg .ne. 1) .or. (lbal_opt .eq. -1)) deallocate( clon_p_cnt )

! save "latitude within longitude" column ordering
! (only needed in find_twin)
      if ((twin_alg .eq. 1) .and. (lbal_opt .ne. -1)) &
         allocate( lonlat_to_dyn_gcol_map(1:ngcols_p) )

      clon_p_dex = 1
      lon_p = -1
      do i=1,ngcols_p
         if ((twin_alg .eq. 1) .and. (lbal_opt .ne. -1)) &
            lonlat_to_dyn_gcol_map(i) = cdex(i)
         do while ((clon_p(clon_p_dex) < clon_d(cdex(i))) .and. &
                   (clon_p_dex < clon_p_tot))
            clon_p_dex = clon_p_dex + 1
         enddo
         lon_p(cdex(i)) = clon_p_dex
      enddo

! Clean-up
      deallocate( clat_d )
      deallocate( clon_d )
      deallocate( cdex )
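! dyn_to_latlon_gcol_map translates a dynamics-ordered global column
! index into the lat/lon-sorted ordering, with -1 marking "holes"
! (dynamics columns not passed to the physics). A hedged illustrative
! test (commented out; `g` is a hypothetical dynamics column index):
!
!    if (dyn_to_latlon_gcol_map(g) /= -1) then
!       ! column g is a real physics column; the map value is its
!       ! rank in the lat/lon-sorted ordering
!    end if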
!
! Determine block index bounds
!
      call get_block_bounds_d(firstblock,lastblock)

! Allocate storage to save number of chunks and columns assigned to each
! process during chunk creation and assignment
!
      allocate( npchunks(0:npes-1) )
      allocate( gs_col_num(0:npes-1) )
      npchunks(:) = 0
      gs_col_num(:) = 0

!
! Option -1: each dynamics block is a single chunk
!
      if (lbal_opt == -1) then
!
! Check that pcols >= maxblksiz
!
         maxblksiz = 0
         do jb=firstblock,lastblock
            maxblksiz = max(maxblksiz,get_block_gcol_cnt_d(jb))
         enddo
         if (pcols < maxblksiz) then
            write(iulog,*) 'pcols = ',pcols, ' maxblksiz=',maxblksiz
            call endrun ('PHYS_GRID_INIT error: phys_loadbalance -1 specified but PCOLS < MAXBLKSIZ')
         endif

!
! Determine total number of chunks
!
         nchunks = (lastblock-firstblock+1)

!
! Set max virtual SMP node size
!
         max_nproc_smpx = 1

!
! Allocate and initialize chunks data structure
!
         allocate( cdex(1:maxblksiz) )
         allocate( chunks(1:nchunks) )

         do cid=1,nchunks

            ! get number of global column indices in block
            max_ncols = get_block_gcol_cnt_d(cid+firstblock-1)

            ! fill cdex array with global indices from current block
            call get_block_gcol_d(cid+firstblock-1,max_ncols,cdex)

            ncols = 0
            do i=1,max_ncols
               ! check whether global index is for a column that dynamics
               ! intends to pass to the physics
               curgcol_d = cdex(i)
               if (dyn_to_latlon_gcol_map(curgcol_d) .ne. -1) then
                  ! yes - then save the information
                  ncols = ncols + 1
                  chunks(cid)%gcol(ncols) = curgcol_d
                  chunks(cid)%lat(ncols)  = lat_p(curgcol_d)
                  chunks(cid)%lon(ncols)  = lon_p(curgcol_d)
               endif
            enddo
            chunks(cid)%ncols = ncols

         enddo

! Clean-up
         deallocate( cdex )
         deallocate( lat_p )
         deallocate( lon_p )

!
! Specify parallel decomposition
!
         do cid=1,nchunks
#if (defined SPMD)
            p = get_block_owner_d(cid+firstblock-1)
#else
            p = 0
#endif
            chunks(cid)%owner = p
            npchunks(p) = npchunks(p) + 1
            gs_col_num(p) = gs_col_num(p) + chunks(cid)%ncols
         enddo

!
! Set flag indicating columns in physics and dynamics
! decompositions reside on the same processes
!
         local_dp_map = .true.
!
      else
!
! Option == 0: split local blocks into chunks,
!              while attempting to create load-balanced chunks.
!              Does not work with vertically decomposed blocks.
!              (default)
! Option == 1: split SMP-local blocks into chunks,
!              while attempting to create load-balanced chunks.
!              Does not work with vertically decomposed blocks.
! Option == 2: load balance chunks with respect to diurnal and
!              seasonal cycles and with respect to latitude,
!              and assign chunks to processes
!              in a way that attempts to minimize communication costs
! Option == 3: divide processes into pairs and split
!              blocks assigned to these pairs into
!              chunks, attempting to create load-balanced chunks.
!              The process pairs are chosen to maximize load balancing
!              opportunities.
!              Does not work with vertically decomposed blocks.
! Option == 4: concatenate local blocks, then
!              divide into chunks.
!              Does not work with vertically decomposed blocks.
! Option == 5: split individual blocks into chunks,
!              assigning columns using block ordering
!
!
! Allocate and initialize chunks data structure, then
! assign chunks to processes.
!
         call create_chunks(lbal_opt, chunks_per_thread)

! Early clean-up, to minimize memory high water mark
         deallocate( lat_p )
         deallocate( lon_p )
         deallocate( latlon_to_dyn_gcol_map )
         if (twin_alg .eq. 1) deallocate( lonlat_to_dyn_gcol_map )
         if (twin_alg .eq. 1) deallocate( clon_p_cnt )
         if ((twin_alg .eq. 1) .or. (lbal_opt .eq. 3)) deallocate( clat_p_cnt )

!
! Determine whether dynamics and physics decompositions
! are colocated, not requiring any interprocess communication
! in the coupling.
         local_dp_map = .true.
         do cid=1,nchunks
            do i=1,chunks(cid)%ncols
               curgcol_d = chunks(cid)%gcol(i)
               block_cnt = get_gcol_block_cnt_d(curgcol_d)
               call get_gcol_block_d(curgcol_d,block_cnt,blockids,bcids)
               do jb=1,block_cnt
                  owner_d = get_block_owner_d(blockids(jb))
                  if (owner_d .ne. chunks(cid)%owner) then
                     local_dp_map = .false.
                  endif
               enddo
            enddo
         enddo
      endif

!
! Allocate and initialize data structures for gather/scatter
!
      allocate( pgcols(1:ngcols_p) )
      allocate( gs_col_offset(0:npes) )
      allocate( pchunkid(0:npes) )

! Initialize pchunkid and gs_col_offset by summing
! number of chunks and columns per process, respectively
      pchunkid(0) = 0
      gs_col_offset(0) = 0
      do p=1,npes-1
         pchunkid(p) = pchunkid(p-1) + npchunks(p-1)
         gs_col_offset(p) = gs_col_offset(p-1) + gs_col_num(p-1)
      enddo

! Determine local ordering via "process id" bin sort
      do cid=1,nchunks
         p = chunks(cid)%owner
         pchunkid(p) = pchunkid(p) + 1

         chunks(cid)%lcid = pchunkid(p) + lastblock

         curgcol = gs_col_offset(p)
         do i=1,chunks(cid)%ncols
            curgcol = curgcol + 1
            pgcols(curgcol)%chunk = cid
            pgcols(curgcol)%ccol = i
         enddo
         gs_col_offset(p) = curgcol
      enddo

! Reinitialize pchunkid and gs_col_offset (for real)
      pchunkid(0) = 1
      gs_col_offset(0) = 1
      do p=1,npes-1
         pchunkid(p) = pchunkid(p-1) + npchunks(p-1)
         gs_col_offset(p) = gs_col_offset(p-1) + gs_col_num(p-1)
      enddo
      pchunkid(npes) = pchunkid(npes-1) + npchunks(npes-1)
      gs_col_offset(npes) = gs_col_offset(npes-1) + gs_col_num(npes-1)

! Save local information
! (Local chunk index range chosen so that it does not overlap
!  {begblock,...,endblock})
!
      nlcols = gs_col_num(iam)
      nlchunks = npchunks(iam)
      begchunk = pchunkid(iam) + lastblock
      endchunk = pchunkid(iam+1) + lastblock - 1
!
      allocate( lchunks(begchunk:endchunk) )
      do cid=1,nchunks
         if (chunks(cid)%owner == iam) then
            lcid = chunks(cid)%lcid
            lchunks(lcid)%ncols = chunks(cid)%ncols
            lchunks(lcid)%cid   = cid
            do i=1,chunks(cid)%ncols
               lchunks(lcid)%gcol(i) = chunks(cid)%gcol(i)
            enddo
         endif
      enddo

      deallocate( pchunkid )
      deallocate( npchunks )
!
!-----------------------------------------------------------------------
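! The local chunk index range {begchunk,...,endchunk} is offset by
! lastblock so it cannot collide with dynamics block indices; this lets
! chunk_index (listed among the entry points above) distinguish the two
! index spaces. Hedged illustrative arithmetic (numbers are made up):
!
!    ! if lastblock = 96 and this process's chunks start at global
!    ! position 9 in the process-ordered chunk list with 4 chunks owned:
!    !    begchunk = 9 + 96 = 105,   endchunk = 105 + 4 - 1 = 108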
!
! Initialize physics grid, using dynamics grid
! b) column area and integration weight
      allocate( area_d(1:ngcols) )
      allocate( wght_d(1:ngcols) )
      area_d = 0.0_r8
      wght_d = 0.0_r8
      call print_memusage('phys_grid_init: after allocate 4')
      call get_horiz_grid_d(ngcols, area_d_out=area_d, wght_d_out=wght_d)

      if ( abs(sum(area_d) - 4.0_r8*pi) > 1.e-10_r8 ) then
         write(iulog,*) ' ERROR: sum of areas on globe does not equal 4*pi'
         write(iulog,*) ' sum of areas = ', sum(area_d), sum(area_d)-4.0_r8*pi
         call endrun('phys_grid')
      end if

      if ( abs(sum(wght_d) - 4.0_r8*pi) > 1.e-10_r8 ) then
         write(iulog,*) ' ERROR: sum of integration weights on globe does not equal 4*pi'
         write(iulog,*) ' sum of weights = ', sum(wght_d), sum(wght_d)-4.0_r8*pi
         call endrun('phys_grid')
      end if

      do lcid=begchunk,endchunk
         do i=1,lchunks(lcid)%ncols
            lchunks(lcid)%area(i) = area_d(lchunks(lcid)%gcol(i))
            lchunks(lcid)%wght(i) = wght_d(lchunks(lcid)%gcol(i))
         enddo
      enddo
      deallocate( area_d )
      deallocate( wght_d )

      if (.not. local_dp_map) then
!
! allocate and initialize data structures for transposes
!
         allocate( btofc_blk_num(0:npes-1) )
         btofc_blk_num = 0
         allocate( btofc_blk_offset(firstblock:lastblock) )
         do jb = firstblock,lastblock
            nullify( btofc_blk_offset(jb)%pter )
         enddo
!
         glbcnt = 0
         curcnt = 0
         curp = 0
         do curgcol=1,ngcols_p
            cid = pgcols(curgcol)%chunk
            i   = pgcols(curgcol)%ccol
            owner_p = chunks(cid)%owner
            do while (curp < owner_p)
               btofc_blk_num(curp) = curcnt
               curcnt = 0
               curp = curp + 1
            enddo
            curgcol_d = chunks(cid)%gcol(i)
            block_cnt = get_gcol_block_cnt_d(curgcol_d)
            call get_gcol_block_d(curgcol_d,block_cnt,blockids,bcids)
            do jb = 1,block_cnt
               owner_d = get_block_owner_d(blockids(jb))
               if (iam == owner_d) then
                  if (.not. associated(btofc_blk_offset(blockids(jb))%pter)) then
                     blksiz = get_block_gcol_cnt_d(blockids(jb))
                     numlvl = get_block_lvl_cnt_d(blockids(jb),bcids(jb))
                     btofc_blk_offset(blockids(jb))%ncols = blksiz
                     btofc_blk_offset(blockids(jb))%nlvls = numlvl
                     allocate( btofc_blk_offset(blockids(jb))%pter(blksiz,numlvl) )
                  endif
                  do k=1,btofc_blk_offset(blockids(jb))%nlvls
                     btofc_blk_offset(blockids(jb))%pter(bcids(jb),k) = glbcnt
                     curcnt = curcnt + 1
                     glbcnt = glbcnt + 1
                  enddo
               endif
            enddo
         enddo
         btofc_blk_num(curp) = curcnt
         block_buf_nrecs = glbcnt
!
         allocate( btofc_chk_num(0:npes-1) )
         btofc_chk_num = 0
         allocate( btofc_chk_offset(begchunk:endchunk) )
         do lcid=begchunk,endchunk
            ncols = lchunks(lcid)%ncols
            btofc_chk_offset(lcid)%ncols = ncols
            btofc_chk_offset(lcid)%nlvls = pver+1
            allocate( btofc_chk_offset(lcid)%pter(ncols,pver+1) )
         enddo
!
         curcnt = 0
         glbcnt = 0
         do p=0,npes-1
            do curgcol=gs_col_offset(iam),gs_col_offset(iam+1)-1
               cid = pgcols(curgcol)%chunk
               owner_p = chunks(cid)%owner
               if (iam == owner_p) then
                  i = pgcols(curgcol)%ccol
                  lcid = chunks(cid)%lcid
                  curgcol_d = chunks(cid)%gcol(i)
                  block_cnt = get_gcol_block_cnt_d(curgcol_d)
                  call get_gcol_block_d(curgcol_d,block_cnt,blockids,bcids)
                  do jb = 1,block_cnt
                     owner_d = get_block_owner_d(blockids(jb))
                     if (p == owner_d) then
                        numlvl = get_block_lvl_cnt_d(blockids(jb),bcids(jb))
                        call get_block_levels_d(blockids(jb),bcids(jb),numlvl,levels)
                        do k=1,numlvl
                           btofc_chk_offset(lcid)%pter(i,levels(k)+1) = glbcnt
                           curcnt = curcnt + 1
                           glbcnt = glbcnt + 1
                        enddo
                     endif
                  enddo
               endif
            enddo
            btofc_chk_num(p) = curcnt
            curcnt = 0
         enddo
         chunk_buf_nrecs = glbcnt
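! At this point btofc_blk_offset/btofc_chk_offset hold 0-based offsets
! into the block_to_chunk send/receive buffers. A hedged illustrative
! reading of one entry (commented out; names as declared above):
!
!    ! the datum for block column (blockid, bcid) at level k lives at
!    ! buffer position btofc_blk_offset(blockid)%pter(bcid,k) + 1
!    ! in the send buffer of length block_buf_nrecs.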
!
! Precompute swap partners and number of steps in point-to-point
! implementations of alltoall algorithm.
! First, determine number of swaps.
!
         dp_coup_steps = 0
         do i=1,ceil2(npes)-1
            p = pair(npes,i,iam)
            if (p >= 0) then
               if ((btofc_blk_num(p) > 0 .or. btofc_chk_num(p) > 0)) then
                  dp_coup_steps = dp_coup_steps + 1
               end if
            end if
         end do
!
! Second, determine swap partners.
!
         allocate( dp_coup_proc(dp_coup_steps) )
         dp_coup_steps = 0
         do i=1,ceil2(npes)-1
            p = pair(npes,i,iam)
            if (p >= 0) then
               if ((btofc_blk_num(p) > 0 .or. btofc_chk_num(p) > 0)) then
                  dp_coup_steps = dp_coup_steps + 1
                  dp_coup_proc(dp_coup_steps) = p
               end if
            end if
         end do
!
      endif

! Final clean-up
      deallocate( gs_col_offset )
! (if eliminate get_lon_xxx, can also deallocate
!  clat_p_idx, and grid_latlon?))
!
      physgrid_set = .true.   ! Set flag indicating physics grid is now set
!
      if (masterproc) then
         write(iulog,*) 'PHYS_GRID_INIT:  Using PCOLS=',pcols,  &
                        '  phys_loadbalance=',lbal_opt,         &
                        '  phys_twin_algorithm=',twin_alg,      &
                        '  phys_alltoall=',phys_alltoall,       &
                        '  chunks_per_thread=',chunks_per_thread
      endif
!
      call t_stopf("phys_grid_init")
      call t_adj_detailf(+2)
      call print_memusage('phys_grid_init: end')

      return
   end subroutine phys_grid_init

!========================================================================

   subroutine phys_grid_find_col(lat, lon, owner, lcid, icol)
!-----------------------------------------------------------------------
!
! Purpose: Find the global column closest to the point specified by lat
!          and lon. Return indices of owning process, local chunk, and
!          column.
!
! Authors: Phil Rasch / Patrick Worley / B. Eaton
!
!-----------------------------------------------------------------------
      real(r8), intent(in) :: lat, lon   ! requested location in degrees
      integer, intent(out) :: owner      ! rank of chunk owner
      integer, intent(out) :: lcid       ! local chunk index
      integer, intent(out) :: icol       ! column index within the chunk

      ! local
      real(r8) dist2         ! the distance (in radians**2 from lat, lon)
      real(r8) distmin       ! the distance (in radians**2 from closest column)
      real(r8) latr, lonr    ! lat, lon (in radians) of requested location
      real(r8) clat, clon    ! lat, lon (in radians) of column being tested
      real(r8) const
      integer i
      integer cid
!-----------------------------------------------------------------------

      ! Check that input lat and lon are in valid range
      if (lon < 0.0_r8 .or. lon >= 360.0_r8 .or. &
          lat < -90.0_r8 .or. lat > 90.0_r8) then
         if (masterproc) then
            write(iulog,*) &
               'phys_grid_find_col: ERROR: lon must satisfy 0.<=lon<360. and lat must satisfy -90<=lat<=90.'
            write(iulog,*) &
               'input lon=', lon, '  input lat=', lat
         endif
         call endrun('phys_grid_find_col: input ERROR')
      end if

      const = 180._r8/pi    ! degrees per radian
      latr = lat/const      ! to radians
      lonr = lon/const      ! to radians

      owner   = -999
      lcid    = -999
      icol    = -999
      distmin = 1.e10_r8

      ! scan all chunks for closest point to lat, lon
      do cid = 1, nchunks
         do i = 1, chunks(cid)%ncols
            clat = clat_p(chunks(cid)%lat(i))
            clon = clon_p(chunks(cid)%lon(i))
            dist2 = (clat-latr)**2 + (clon-lonr)**2
            if (dist2 < distmin ) then
               distmin = dist2
               owner = chunks(cid)%owner
               lcid = chunks(cid)%lcid
               icol = i
            endif
         enddo
      end do

   end subroutine phys_grid_find_col
!
!========================================================================

   logical function phys_grid_initialized ()
!-----------------------------------------------------------------------
!
! Purpose: Identify whether phys_grid has been called yet or not
!
! Method: Return physgrid_set
!
! Author: Pat Worley
!
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
!
      phys_grid_initialized = physgrid_set
!
      return
   end function phys_grid_initialized
!
!========================================================================
!
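! A hedged usage sketch for phys_grid_find_col (illustrative only;
! arguments in degrees, as documented above):
!
!    call phys_grid_find_col(40.0_r8, 255.0_r8, owner, lcid, icol)
!    if (iam == owner) then
!       ! this process holds the nearest column at (lcid, icol)
!    end if
!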
   subroutine phys_grid_defaultopts(phys_loadbalance_out, &
                                    phys_twin_algorithm_out, &
                                    phys_alltoall_out, &
                                    phys_chnk_per_thd_out )
!-----------------------------------------------------------------------
! Purpose: Return default runtime options
! Author: Tom Henderson
!-----------------------------------------------------------------------
      use dycore, only: dycore_is
!------------------------------Arguments--------------------------------
      ! physics load balancing option
      integer, intent(out), optional :: phys_loadbalance_out
      ! algorithm to use when determining column pairs to assign to chunks
      integer, intent(out), optional :: phys_twin_algorithm_out
      ! alltoall option
      integer, intent(out), optional :: phys_alltoall_out
      ! number of chunks per thread
      integer, intent(out), optional :: phys_chnk_per_thd_out
!-----------------------------------------------------------------------
      if ( present(phys_loadbalance_out) ) then
         phys_loadbalance_out = def_lbal_opt
      endif
      if ( present(phys_twin_algorithm_out) ) then
         if (dycore_is('UNSTRUCTURED')) then
            phys_twin_algorithm_out = def_twin_alg_unstructured
         else
            phys_twin_algorithm_out = def_twin_alg_lonlat
         endif
      endif
      if ( present(phys_alltoall_out) ) then
         phys_alltoall_out = def_alltoall
      endif
      if ( present(phys_chnk_per_thd_out) ) then
         phys_chnk_per_thd_out = def_chunks_per_thread
      endif
   end subroutine phys_grid_defaultopts
!
!========================================================================
!
   subroutine phys_grid_setopts(phys_loadbalance_in, &
                                phys_twin_algorithm_in, &
                                phys_alltoall_in, &
                                phys_chnk_per_thd_in )
!-----------------------------------------------------------------------
! Purpose: Set runtime options
! Author: Tom Henderson
!-----------------------------------------------------------------------
      use spmd_utils, only: phys_mirror_decomp_req
#if defined(MODCM_DP_TRANSPOSE)
      use mod_comm, only: phys_transpose_mod
#endif
!------------------------------Arguments--------------------------------
      ! physics load balancing option
      integer, intent(in), optional :: phys_loadbalance_in
      ! option to use load balanced column pairs
      integer, intent(in), optional :: phys_twin_algorithm_in
      ! alltoall option
      integer, intent(in), optional :: phys_alltoall_in
      ! number of chunks per thread
      integer, intent(in), optional :: phys_chnk_per_thd_in
!-----------------------------------------------------------------------
      if ( present(phys_loadbalance_in) ) then
         lbal_opt = phys_loadbalance_in
         if ((lbal_opt < min_lbal_opt).or.(lbal_opt > max_lbal_opt)) then
            if (masterproc) then
               write(iulog,*) &
                  'PHYS_GRID_SETOPTS:  ERROR:  phys_loadbalance=', &
                  phys_loadbalance_in, &
                  '  is out of range.  It must be between ', &
                  min_lbal_opt,' and ',max_lbal_opt
            endif
            call endrun
         endif
         if (lbal_opt .eq. 3) then
            phys_mirror_decomp_req = .true.
         else
            phys_mirror_decomp_req = .false.
         endif
      endif
!
      if ( present(phys_twin_algorithm_in) ) then
         twin_alg = phys_twin_algorithm_in
         if ((twin_alg < min_twin_alg).or.(twin_alg > max_twin_alg)) then
            if (masterproc) then
               write(iulog,*) &
                  'PHYS_GRID_SETOPTS:  ERROR:  phys_twin_algorithm=', &
                  phys_twin_algorithm_in, &
                  '  is out of range.  It must be between ', &
                  min_twin_alg,' and ',max_twin_alg
            endif
            call endrun
         endif
      endif
      if ( present(phys_alltoall_in) ) then
         phys_alltoall = phys_alltoall_in
         if (((phys_alltoall .lt. min_alltoall) .or. &
              (phys_alltoall .gt. max_alltoall)) &
# if defined(MODCM_DP_TRANSPOSE)
             .and. &
             ((phys_alltoall .lt. modmin_alltoall) .or. &
              (phys_alltoall .gt. modmax_alltoall)) &
# endif
            ) then
            if (masterproc) then
               write(iulog,*) &
                  'PHYS_GRID_SETOPTS:  ERROR:  phys_alltoall=', &
                  phys_alltoall_in, &
                  '  is out of range.  It must be between ', &
                  min_alltoall,' and ',max_alltoall
            endif
            call endrun
         endif
#if defined(SPMD)
# if defined(MODCM_DP_TRANSPOSE)
         phys_transpose_mod = phys_alltoall
# endif
#endif
      endif
!
      if ( present(phys_chnk_per_thd_in) ) then
         chunks_per_thread = phys_chnk_per_thd_in
         if (chunks_per_thread < min_chunks_per_thread) then
            if (masterproc) then
               write(iulog,*) &
                  'PHYS_GRID_SETOPTS:  ERROR:  phys_chnk_per_thd=',&
                  phys_chnk_per_thd_in, &
                  '  is too small.  It must not be smaller than ', &
                  min_chunks_per_thread
            endif
            call endrun
         endif
      endif
   end subroutine phys_grid_setopts
!
!========================================================================
!
   subroutine get_chunk_indices_p(index_beg, index_end)
!-----------------------------------------------------------------------
!
! Purpose: Return range of indices for local chunks
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
!------------------------------Arguments--------------------------------
      integer, intent(out) :: index_beg  ! first index used for local chunks
      integer, intent(out) :: index_end  ! last index used for local chunks
!-----------------------------------------------------------------------
      index_beg = begchunk
      index_end = endchunk

      return
   end subroutine get_chunk_indices_p
!
!========================================================================
!
   subroutine get_gcol_all_p(lcid, latdim, gcols)
!-----------------------------------------------------------------------
!
! Purpose: Return all global column indices for chunk
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
!------------------------------Arguments--------------------------------
      integer, intent(in)  :: lcid      ! local chunk id
      integer, intent(in)  :: latdim    ! declared size of output array
      integer, intent(out) :: gcols(:)  ! array of global column indices
!---------------------------Local workspace-----------------------------
      integer :: i                      ! loop index
!-----------------------------------------------------------------------
      gcols=-1
      do i=1,lchunks(lcid)%ncols
         gcols(i) = lchunks(lcid)%gcol(i)
      enddo

      return
   end subroutine get_gcol_all_p
!
!========================================================================
!
   integer function get_gcol_p(lcid, col)
!-----------------------------------------------------------------------
!
! Purpose: Return global physics column index for chunk column
!
! Method:
!
! Author: Jim Edwards / Patrick Worley
!
!-----------------------------------------------------------------------
!------------------------------Arguments--------------------------------
      integer, intent(in) :: lcid     ! local chunk id
      integer, intent(in) :: col      ! column index
!-----------------------------------------------------------------------
      get_gcol_p = lchunks(lcid)%gcol(col)

      return
   end function get_gcol_p
!
!========================================================================

   subroutine get_gcol_vec_p(lcid, lth, cols, gcols)
!-----------------------------------------------------------------------
!
! Purpose: Return global physics column indices for set of chunk columns
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in)  :: lcid        ! local chunk id
      integer, intent(in)  :: lth         ! number of column indices
      integer, intent(in)  :: cols(lth)   ! column indices
      integer, intent(out) :: gcols(lth)  ! array of global physics
                                          ! column indices
!---------------------------Local workspace-----------------------------
      integer :: i                        ! loop index
!-----------------------------------------------------------------------
      do i=1,lth
         gcols(i) = lchunks(lcid)%gcol(cols(i))
      enddo

      return
   end subroutine get_gcol_vec_p
!
!========================================================================
!
   integer function get_ncols_p(lcid)
!-----------------------------------------------------------------------
!
! Purpose: Return number of columns in chunk given the local chunk id.
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
!------------------------------Arguments--------------------------------
      integer, intent(in) :: lcid     ! local chunk id
!---------------------------Local workspace-----------------------------
      integer :: cid                  ! global chunk id
!-----------------------------------------------------------------------
      get_ncols_p = lchunks(lcid)%ncols

      return
   end function get_ncols_p
!
!========================================================================
!
   subroutine get_lat_all_p(lcid, latdim, lats)
!-----------------------------------------------------------------------
!
! Purpose: Return all global latitude indices for chunk
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in)  :: lcid          ! local chunk id
      integer, intent(in)  :: latdim        ! declared size of output array
      integer, intent(out) :: lats(latdim)  ! array of global latitude indices
!---------------------------Local workspace-----------------------------
      integer :: i                          ! loop index
      integer :: cid                        ! global chunk id
!-----------------------------------------------------------------------
      cid = lchunks(lcid)%cid
      do i=1,chunks(cid)%ncols
         lats(i) = chunks(cid)%lat(i)
      enddo

      return
   end subroutine get_lat_all_p
!
!========================================================================

   subroutine get_lat_vec_p(lcid, lth, cols, lats)
!-----------------------------------------------------------------------
!
! Purpose: Return global latitude indices for set of chunk columns
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in)  :: lcid        ! local chunk id
      integer, intent(in)  :: lth         ! number of column indices
      integer, intent(in)  :: cols(lth)   ! column indices
      integer, intent(out) :: lats(lth)   ! array of global latitude indices
!---------------------------Local workspace-----------------------------
      integer :: i                        ! loop index
      integer :: cid                      ! global chunk id
!-----------------------------------------------------------------------
      cid = lchunks(lcid)%cid
      do i=1,lth
         lats(i) = chunks(cid)%lat(cols(i))
      enddo

      return
   end subroutine get_lat_vec_p
!
!========================================================================

   integer function get_lat_p(lcid, col)
!-----------------------------------------------------------------------
!
! Purpose: Return global latitude index for chunk column
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in) :: lcid     ! local chunk id
      integer, intent(in) :: col      ! column index
!---------------------------Local workspace-----------------------------
      integer :: cid                  ! global chunk id
!-----------------------------------------------------------------------
      cid = lchunks(lcid)%cid
      get_lat_p = chunks(cid)%lat(col)

      return
   end function get_lat_p
!
!========================================================================
!
   subroutine get_lon_all_p(lcid, londim, lons)
!-----------------------------------------------------------------------
!
! Purpose:
! Was: Return all global longitude indices for chunk
! Now: Return all longitude offsets (+1) for chunk. These are offsets
!      in ordered list of global columns from first
!      column with given latitude to column with given latitude
!      and longitude. This corresponds to the usual longitude indices
!      for full and reduced lon/lat grids.
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in)  :: lcid          ! local chunk id
      integer, intent(in)  :: londim        ! declared size of output array
      integer, intent(out) :: lons(londim)  ! array of global longitude
                                            ! indices
!---------------------------Local workspace-----------------------------
      integer :: i                          ! loop index
      integer :: lat                        ! latitude index
      integer :: cid                        ! global chunk id
      integer :: gcol                       ! global column id in latlon
                                            ! ordering
!-----------------------------------------------------------------------
      cid = lchunks(lcid)%cid
      do i=1,chunks(cid)%ncols
         lat = chunks(cid)%lat(i)
         gcol = dyn_to_latlon_gcol_map(chunks(cid)%gcol(i))
         lons(i) = (gcol - clat_p_idx(lat)) + 1
      enddo

      return
   end subroutine get_lon_all_p
!
!========================================================================

   subroutine get_lon_vec_p(lcid, lth, cols, lons)
!-----------------------------------------------------------------------
!
! Purpose:
! Was: Return global longitude indices for set of chunk columns.
! Now: Return longitude offsets (+1) for set of chunk columns.
!      These are offsets in ordered list of global columns from first
!      column with given latitude to column with given latitude
!      and longitude. This corresponds to the usual longitude indices
!      for full and reduced lon/lat grids.
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in)  :: lcid        ! local chunk id
      integer, intent(in)  :: lth         ! number of column indices
      integer, intent(in)  :: cols(lth)   ! column indices
      integer, intent(out) :: lons(lth)   ! array of global longitude indices
!---------------------------Local workspace-----------------------------
      integer :: i                        ! loop index
      integer :: lat                      ! latitude index
      integer :: cid                      ! global chunk id
      integer :: gcol                     ! global column id in latlon
                                          ! ordering
!-----------------------------------------------------------------------
      cid = lchunks(lcid)%cid
      do i=1,lth
         lat = chunks(cid)%lat(cols(i))
         gcol = dyn_to_latlon_gcol_map(chunks(cid)%gcol(cols(i)))
         lons(i) = (gcol - clat_p_idx(lat)) + 1
      enddo

      return
   end subroutine get_lon_vec_p
!
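! The "longitude offset" returned by the get_lon_* routines is a
! position within a latitude row: clat_p_idx(lat) is the latlon rank of
! the first column in row lat, so (gcol - clat_p_idx(lat)) + 1 counts
! columns from the start of that row. A hedged worked example
! (illustrative numbers only):
!
!    ! if row lat starts at latlon rank clat_p_idx(lat) = 513 and the
!    ! column of interest has latlon rank gcol = 520, the offset is
!    ! (520 - 513) + 1 = 8, i.e. the 8th column in that latitude row.
!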
!========================================================================

   integer function get_lon_p(lcid, col)
!-----------------------------------------------------------------------
!
! Purpose:
! Was: Return global longitude index for chunk column.
! Now: Return longitude offset (+1) for chunk column. This is the
!      offset in ordered list of global columns from first
!      column with given latitude to column with given latitude
!      and longitude. This corresponds to the usual longitude index
!      for full and reduced lon/lat grids.
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in) :: lcid     ! local chunk id
      integer, intent(in) :: col      ! column index
!---------------------------Local workspace-----------------------------
      integer :: cid                  ! global chunk id
      integer :: lat                  ! latitude index
      integer :: gcol                 ! global column id in latlon
                                      ! ordering
!-----------------------------------------------------------------------
      cid = lchunks(lcid)%cid
      lat = chunks(cid)%lat(col)
      gcol = dyn_to_latlon_gcol_map(chunks(cid)%gcol(col))
      get_lon_p = (gcol - clat_p_idx(lat)) + 1

      return
   end function get_lon_p
!
!========================================================================
!
   subroutine get_rlat_all_p(lcid, rlatdim, rlats)
!-----------------------------------------------------------------------
!
! Purpose: Return all latitudes (in radians) for chunk
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in)   :: lcid            ! local chunk id
      integer, intent(in)   :: rlatdim         ! declared size of output array
      real(r8), intent(out) :: rlats(rlatdim)  ! array of latitudes
!---------------------------Local workspace-----------------------------
      integer :: i                             ! loop index
      integer :: cid                           ! global chunk id
!-----------------------------------------------------------------------
      cid = lchunks(lcid)%cid
      do i=1,chunks(cid)%ncols
         rlats(i) = clat_p(chunks(cid)%lat(i))
      enddo

      return
   end subroutine get_rlat_all_p
!
!========================================================================
!
   subroutine get_area_all_p(lcid, rdim, area)
!-----------------------------------------------------------------------
!
! Purpose: Return all areas for chunk
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in)   :: lcid        ! local chunk id
      integer, intent(in)   :: rdim        ! declared size of output array
      real(r8), intent(out) :: area(rdim)  ! array of areas
!---------------------------Local workspace-----------------------------
      integer :: i                         ! loop index
!-----------------------------------------------------------------------
      do i=1,lchunks(lcid)%ncols
         area(i) = lchunks(lcid)%area(i)
      enddo

      return
   end subroutine get_area_all_p
!
!========================================================================
!
   real(r8) function get_area_p(lcid, col)
!-----------------------------------------------------------------------
!
! Purpose: Return area for chunk column
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in) :: lcid     ! local chunk id
      integer, intent(in) :: col      ! column index
!-----------------------------------------------------------------------
      get_area_p = lchunks(lcid)%area(col)

      return
   end function get_area_p
!
!========================================================================
!
   subroutine get_wght_all_p(lcid, rdim, wght)
!-----------------------------------------------------------------------
!
! Purpose: Return all integration weights for chunk
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in)   :: lcid        ! local chunk id
      integer, intent(in)   :: rdim        ! declared size of output array
      real(r8), intent(out) :: wght(rdim)  ! array of integration weights
!---------------------------Local workspace-----------------------------
      integer :: i                         ! loop index
!-----------------------------------------------------------------------
      do i=1,lchunks(lcid)%ncols
         wght(i) = lchunks(lcid)%wght(i)
      enddo

      return
   end subroutine get_wght_all_p
!
!========================================================================
!
   real(r8) function get_wght_p(lcid, col)
!-----------------------------------------------------------------------
!
! Purpose: Return integration weight for chunk column
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in) :: lcid     ! local chunk id
      integer, intent(in) :: col      ! column index
!-----------------------------------------------------------------------
      get_wght_p = lchunks(lcid)%wght(col)

      return
   end function get_wght_p
!
!========================================================================
!
   subroutine get_rlat_vec_p(lcid, lth, cols, rlats)
!-----------------------------------------------------------------------
!
! Purpose: Return latitudes (in radians) for set of chunk columns
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in)   :: lcid        ! local chunk id
      integer, intent(in)   :: lth         ! number of column indices
      integer, intent(in)   :: cols(lth)   ! column indices
      real(r8), intent(out) :: rlats(lth)  ! array of latitudes
!---------------------------Local workspace-----------------------------
      integer :: i                         ! loop index
      integer :: cid                       ! global chunk id
!-----------------------------------------------------------------------
      cid = lchunks(lcid)%cid
      do i=1,lth
         rlats(i) = clat_p(chunks(cid)%lat(cols(i)))
      enddo

      return
   end subroutine get_rlat_vec_p
!
!========================================================================

   real(r8) function get_rlat_p(lcid, col)
!-----------------------------------------------------------------------
!
! Purpose: Return latitude (in radians) for chunk column
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in) :: lcid     ! local chunk id
      integer, intent(in) :: col      ! column index
!---------------------------Local workspace-----------------------------
      integer :: cid                  ! global chunk id
!-----------------------------------------------------------------------
      cid = lchunks(lcid)%cid
      get_rlat_p = clat_p(chunks(cid)%lat(col))

      return
   end function get_rlat_p
!
!========================================================================
!
   subroutine get_rlon_all_p(lcid, rlondim, rlons)
!-----------------------------------------------------------------------
!
! Purpose: Return all longitudes (in radians) for chunk
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in)   :: lcid            ! local chunk id
      integer, intent(in)   :: rlondim         ! declared size of output array
      real(r8), intent(out) :: rlons(rlondim)  ! array of longitudes
!---------------------------Local workspace-----------------------------
      integer :: i                             ! loop index
      integer :: cid                           ! global chunk id
!-----------------------------------------------------------------------
      cid = lchunks(lcid)%cid
      do i=1,chunks(cid)%ncols
         rlons(i) = clon_p(chunks(cid)%lon(i))
      enddo

      return
   end subroutine get_rlon_all_p
!
!========================================================================

   subroutine get_rlon_vec_p(lcid, lth, cols, rlons)
!-----------------------------------------------------------------------
!
! Purpose: Return longitudes (in radians) for set of chunk columns
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in)   :: lcid        ! local chunk id
      integer, intent(in)   :: lth         ! number of column indices
      integer, intent(in)   :: cols(lth)   ! column indices
      real(r8), intent(out) :: rlons(lth)  ! array of longitudes
!---------------------------Local workspace-----------------------------
      integer :: i                         ! loop index
      integer :: cid                       ! global chunk id
!-----------------------------------------------------------------------
      cid = lchunks(lcid)%cid
      do i=1,lth
         rlons(i) = clon_p(chunks(cid)%lon(cols(i)))
      enddo

      return
   end subroutine get_rlon_vec_p
!
!========================================================================

   real(r8) function get_rlon_p(lcid, col)
!-----------------------------------------------------------------------
!
! Purpose: Return longitude (in radians) for chunk column
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
      use ppgrid
!------------------------------Arguments--------------------------------
      integer, intent(in) :: lcid     ! local chunk id
      integer, intent(in) :: col      ! column index
!---------------------------Local workspace-----------------------------
      integer :: cid                  ! global chunk id
!-----------------------------------------------------------------------
      cid = lchunks(lcid)%cid
      get_rlon_p = clon_p(chunks(cid)%lon(col))

      return
   end function get_rlon_p
!
!========================================================================
!
!
!   integer function get_gcol_owner_p(gcol)
!-----------------------------------------------------------------------
!
! Purpose: Return owner of physics column with indicated index
!
! Method:
!
! Author: P. Worley
!
!-----------------------------------------------------------------------
!------------------------------Arguments--------------------------------
!   integer, intent(in)  :: gcol     ! physics column index
!
!-----------------------------------------------------------------------
!
!   get_gcol_owner_p = chunks(knuhcs(gcol)%chunkid)%owner
!
!   return
!   end function get_gcol_owner_p
!
!========================================================================
!
!   subroutine buff_to_chunk(fdim,mdim,lbuff,localchunks)
!-----------------------------------------------------------------------
!
! Purpose: Copy from local buffer
!          to local chunk data structure.
!          Needed for cpl6.
!
! Method:
!
! Author: Pat Worley and Robert Jacob
!
!-----------------------------------------------------------------------
!------------------------------Arguments--------------------------------
!   integer, intent(in) :: fdim      ! declared length of first lbuff dimension
!   integer, intent(in) :: mdim      ! declared length of middle lbuff dimension
!   real(r8), intent(in) :: lbuff(fdim, mdim)  ! local lon/lat buffer
!
!   real(r8), intent(out):: localchunks(pcols,mdim,begchunk:endchunk)  ! local chunks
!
!
!---------------------------Local workspace-----------------------------
!   integer :: i,j,m,n               ! loop indices
!
!   integer, save :: numcols = 0
!   integer, allocatable, save :: columnid(:), chunkid(:)
!-----------------------------------------------------------------------
!
!   if (numcols .eq. 0) then
!      n = 0
!      do i=1,ngcols
!         if (dyn_to_latlon_gcol_map(i) .ne. -1) then
!            if(chunks(knuhcs(i)%chunkid)%owner .eq. iam) then
!               n = n + 1
!            endif
!         endif
!      enddo
!      allocate(columnid(1:n))
!      allocate(chunkid(1:n))
!
!      n = 0
!      do i=1,ngcols
!         if (dyn_to_latlon_gcol_map(i) .ne. -1) then
!            if(chunks(knuhcs(i)%chunkid)%owner .eq. iam) then
!               n = n + 1
!               columnid(n) = knuhcs(i)%col
!               chunkid(n) = chunks(knuhcs(i)%chunkid)%lcid
!            endif
!         endif
!      end do
!
!      numcols = n
!   endif
!
!   if (numcols .gt. fdim) call endrun('buff_to_chunk')
!   do m=1,mdim
!dir$ concurrent
!dir$ prefervector, preferstream
!      do n = 1, numcols
!         localchunks(columnid(n),m,chunkid(n)) = lbuff(n,m)
!      end do
!   end do
!
!   return
!   end subroutine buff_to_chunk
!
!========================================================================

   subroutine scatter_field_to_chunk(fdim,mdim,ldim, &
                                     hdim1d,globalfield,localchunks)
!-----------------------------------------------------------------------
!
! Purpose: Distribute field
!          to decomposed chunk data structure
!
! Method:
!
! Author: Patrick Worley
!
!------------------------------Arguments--------------------------------
      integer, intent(in) :: fdim      ! declared length of first dimension
      integer, intent(in) :: mdim      ! declared length of middle dimension
      integer, intent(in) :: ldim      ! declared length of last dimension
      integer, intent(in) :: hdim1d    ! declared first horizontal index
                                       ! dimension
      real(r8), intent(in) :: globalfield(fdim,hdim1d,mdim,hdim2_d,ldim)
                                       ! global field
      real(r8), intent(out):: localchunks(fdim,pcols,mdim, &
                                          begchunk:endchunk,ldim)
                                       ! local chunks
!---------------------------Local workspace-----------------------------
      integer :: f,i,m,l,p             ! loop indices
      integer :: cid                   ! global chunk id
      integer :: lcid                  ! local chunk id
      integer :: lid                   ! local column index
      integer :: gcol                  ! global column index
      integer :: h1                    ! first horizontal dimension index
      integer :: h2                    ! second horizontal dimension index
#if ( defined SPMD )
      real(r8) gfield_p(fdim,mdim,ldim,ngcols)
                                       ! vector to be scattered
      real(r8) lfield_p(fdim,mdim,ldim,nlcols)
                                       ! local component of scattered
                                       ! vector
      integer :: displs(0:npes-1)      ! scatter displacements
      integer :: sndcnts(0:npes-1)     ! scatter send counts
      integer :: recvcnt               ! scatter receive count
scatter receive count integer :: beglcol ! beginning index for local columns ! in global column ordering #endif !----------------------------------------------------------------------- if (hdim1d < hdim1_d) then write(iulog,*) __FILE__,__LINE__,hdim1d,hdim1_d call endrun ('SCATTER_FIELD_TO_CHUNK error: hdim1d < hdim1_d') endif localchunks(:,:,:,:,:) = 0 #if ( defined SPMD ) displs(0) = 0 sndcnts(0) = fdim*mdim*ldim*gs_col_num(0) beglcol = 0 do p=1,npes-1 displs(p) = displs(p-1) + sndcnts(p-1) sndcnts(p) = fdim*mdim*ldim*gs_col_num(p) if (p <= iam) then beglcol = beglcol + gs_col_num(p-1) endif enddo recvcnt = fdim*mdim*ldim*nlcols if (masterproc) then ! copy field into global (process-ordered) chunked data structure do l=1,ldim !DIR$ PREFERVECTOR !DIR$ PREFERSTREAM !DIR$ CONCURRENT do i=1,ngcols_p cid = pgcols(i)%chunk lid = pgcols(i)%ccol gcol = chunks(cid)%gcol(lid) h2 = (gcol-1)/hdim1_d + 1 h1 = mod((gcol-1),hdim1_d) + 1 do m=1,mdim do f=1,fdim gfield_p(f,m,l,i) = & globalfield(f, h1, m, h2, l) end do end do end do end do endif ! scatter to other processes ! (pgcols ordering consistent with begchunk:endchunk ! local ordering) call t_barrierf('sync_scat_ftoc', mpicom) call mpiscatterv(gfield_p, sndcnts, displs, mpir8, & lfield_p, recvcnt, mpir8, 0, mpicom) ! copy into local chunked data structure !DIR$ PREFERVECTOR !DIR$ PREFERSTREAM !DIR$ CONCURRENT do i=1,nlcols cid = pgcols(beglcol+i)%chunk lcid = chunks(cid)%lcid lid = pgcols(beglcol+i)%ccol do l=1,ldim do m=1,mdim do f=1,fdim localchunks(f,lid,m,lcid,l) = & lfield_p(f, m, l, i) end do end do end do end do #else ! copy field into chunked data structure ! (pgcol ordering chosen to reflect begchunk:endchunk ! local ordering) do l=1,ldim !DIR$ PREFERVECTOR !DIR$ PREFERSTREAM !DIR$ CONCURRENT do i=1,ngcols_p cid = pgcols(i)%chunk lcid = chunks(cid)%lcid lid = pgcols(i)%ccol gcol = chunks(cid)%gcol(lid) h2 = (gcol-1)/hdim1_d + 1 h1 = mod((gcol-1),hdim1_d) + 1 do m=1,mdim do f=1,fdim localchunks(f,lid,m,lcid,l) = & globalfield(f, h1, m, h2, l) end do end do end do end do #endif return end subroutine scatter_field_to_chunk !========================================================================
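!
! A hedged usage sketch for scatter_field_to_chunk with a single-level,
! single-record field (fdim = mdim = ldim = 1). field_g and field_c are
! hypothetical names, the global array sizes are shown with the module's
! internal dimension names for clarity, and Fortran sequence association
! is relied on to pass the rank-2/rank-2 actuals to the rank-5 dummies:
!
!   real(r8) :: field_g(hdim1_d,hdim2_d)         ! global field (masterproc)
!   real(r8) :: field_c(pcols,begchunk:endchunk) ! chunked local copy
!   call scatter_field_to_chunk(1, 1, 1, hdim1_d, field_g, field_c)
!
! gather_chunk_to_field below performs the inverse operation with the
! same argument conventions.
!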
subroutine scatter_field_to_chunk4(fdim,mdim,ldim, & hdim1d,globalfield,localchunks) !----------------------------------------------------------------------- ! ! Purpose: Distribute field ! to decomposed chunk data structure ! ! Method: ! ! Author: Patrick Worley ! !----------------------------------------------------------------------- !------------------------------Arguments-------------------------------- integer, intent(in) :: fdim ! declared length of first dimension integer, intent(in) :: mdim ! declared length of middle dimension integer, intent(in) :: ldim ! declared length of last dimension integer, intent(in) :: hdim1d ! declared first horizontal index ! dimension real(r4), intent(in) :: globalfield(fdim,hdim1d,mdim,hdim2_d,ldim) ! global field real(r4), intent(out):: localchunks(fdim,pcols,mdim, & begchunk:endchunk,ldim) ! local chunks !---------------------------Local workspace----------------------------- integer :: f,i,m,l,p ! loop indices integer :: cid ! global chunk id integer :: lcid ! local chunk id integer :: lid ! local column index integer :: gcol ! global column index integer :: h1 ! first horizontal dimension index integer :: h2 ! second horizontal dimension index #if ( defined SPMD ) real(r4) gfield_p(fdim,mdim,ldim,ngcols) ! vector to be scattered real(r4) lfield_p(fdim,mdim,ldim,nlcols) ! local component of scattered ! vector integer :: displs(0:npes-1) ! scatter displacements integer :: sndcnts(0:npes-1) ! scatter send counts integer :: recvcnt ! scatter receive count integer :: beglcol ! beginning index for local columns ! in global column ordering #endif !----------------------------------------------------------------------- if (hdim1d < hdim1_d) then call endrun ('SCATTER_FIELD_TO_CHUNK4 error: hdim1d < hdim1_d') endif #if ( defined SPMD ) displs(0) = 0 sndcnts(0) = fdim*mdim*ldim*gs_col_num(0) beglcol = 0 do p=1,npes-1 displs(p) = displs(p-1) + sndcnts(p-1) sndcnts(p) = fdim*mdim*ldim*gs_col_num(p) if (p <= iam) then beglcol = beglcol + gs_col_num(p-1) endif enddo recvcnt = fdim*mdim*ldim*nlcols if (masterproc) then ! copy field into global (process-ordered) chunked data structure do l=1,ldim !DIR$ PREFERVECTOR !DIR$ PREFERSTREAM !DIR$ CONCURRENT do i=1,ngcols_p cid = pgcols(i)%chunk lid = pgcols(i)%ccol gcol = chunks(cid)%gcol(lid) h2 = (gcol-1)/hdim1_d + 1 h1 = mod((gcol-1),hdim1_d) + 1 do m=1,mdim do f=1,fdim gfield_p(f,m,l,i) = & globalfield(f, h1, m, h2, l) end do end do end do end do endif ! scatter to other processes ! (pgcols ordering consistent with begchunk:endchunk ! local ordering) call t_barrierf('sync_scat_ftoc', mpicom) call mpiscatterv(gfield_p, sndcnts, displs, mpir4, & lfield_p, recvcnt, mpir4, 0, mpicom) ! copy into local chunked data structure !DIR$ PREFERVECTOR !DIR$ PREFERSTREAM !DIR$ CONCURRENT do i=1,nlcols cid = pgcols(beglcol+i)%chunk lcid = chunks(cid)%lcid lid = pgcols(beglcol+i)%ccol do l=1,ldim do m=1,mdim do f=1,fdim localchunks(f,lid,m,lcid,l) = & lfield_p(f, m, l, i) end do end do end do end do #else ! copy field into chunked data structure ! (pgcol ordering chosen to reflect begchunk:endchunk ! local ordering) do l=1,ldim !DIR$ PREFERVECTOR !DIR$ PREFERSTREAM !DIR$ CONCURRENT do i=1,ngcols_p cid = pgcols(i)%chunk lcid = chunks(cid)%lcid lid = pgcols(i)%ccol gcol = chunks(cid)%gcol(lid) h2 = (gcol-1)/hdim1_d + 1 h1 = mod((gcol-1),hdim1_d) + 1 do m=1,mdim do f=1,fdim localchunks(f,lid,m,lcid,l) = & globalfield(f, h1, m, h2, l) end do end do end do end do #endif return end subroutine scatter_field_to_chunk4 !======================================================================== subroutine scatter_field_to_chunk_int(fdim,mdim,ldim, & hdim1d,globalfield,localchunks) !----------------------------------------------------------------------- ! ! Purpose: Distribute field ! to decomposed chunk data structure ! ! Method: ! ! Author: Patrick Worley ! !------------------------------Arguments-------------------------------- integer, intent(in) :: fdim ! declared length of first dimension integer, intent(in) :: mdim ! declared length of middle dimension integer, intent(in) :: ldim ! declared length of last dimension integer, intent(in) :: hdim1d ! declared first horizontal index ! dimension integer, intent(in) :: globalfield(fdim,hdim1d,mdim,hdim2_d,ldim) ! global field integer, intent(out):: localchunks(fdim,pcols,mdim, & begchunk:endchunk,ldim) ! local chunks !---------------------------Local workspace----------------------------- integer :: f,i,m,l,p ! loop indices integer :: cid ! global chunk id integer :: lcid ! local chunk id integer :: lid ! local column index integer :: gcol ! global column index integer :: h1 ! first horizontal dimension index integer :: h2 ! second horizontal dimension index #if ( defined SPMD ) integer gfield_p(fdim,mdim,ldim,ngcols) ! vector to be scattered integer lfield_p(fdim,mdim,ldim,nlcols) ! local component of scattered !
vector integer :: displs(0:npes-1) ! scatter displacements integer :: sndcnts(0:npes-1) ! scatter send counts integer :: recvcnt ! scatter receive count integer :: beglcol ! beginning index for local columns ! in global column ordering #endif !----------------------------------------------------------------------- if (hdim1d < hdim1_d) then call endrun ('SCATTER_FIELD_TO_CHUNK_INT error: hdim1d < hdim1_d') endif #if ( defined SPMD ) displs(0) = 0 sndcnts(0) = fdim*mdim*ldim*gs_col_num(0) beglcol = 0 do p=1,npes-1 displs(p) = displs(p-1) + sndcnts(p-1) sndcnts(p) = fdim*mdim*ldim*gs_col_num(p) if (p <= iam) then beglcol = beglcol + gs_col_num(p-1) endif enddo recvcnt = fdim*mdim*ldim*nlcols if (masterproc) then ! copy field into global (process-ordered) chunked data structure do l=1,ldim !DIR$ PREFERVECTOR !DIR$ PREFERSTREAM !DIR$ CONCURRENT do i=1,ngcols_p cid = pgcols(i)%chunk lid = pgcols(i)%ccol gcol = chunks(cid)%gcol(lid) h2 = (gcol-1)/hdim1_d + 1 h1 = mod((gcol-1),hdim1_d) + 1 do m=1,mdim do f=1,fdim gfield_p(f,m,l,i) = & globalfield(f, h1, m, h2, l) end do end do end do end do endif ! scatter to other processes ! (pgcols ordering consistent with begchunk:endchunk ! local ordering) call t_barrierf('sync_scat_ftoc', mpicom) call mpiscatterv(gfield_p, sndcnts, displs, mpiint, & lfield_p, recvcnt, mpiint, 0, mpicom) ! copy into local chunked data structure !DIR$ PREFERVECTOR !DIR$ PREFERSTREAM !DIR$ CONCURRENT do i=1,nlcols cid = pgcols(beglcol+i)%chunk lcid = chunks(cid)%lcid lid = pgcols(beglcol+i)%ccol do l=1,ldim do m=1,mdim do f=1,fdim localchunks(f,lid,m,lcid,l) = & lfield_p(f, m, l, i) end do end do end do end do #else ! copy field into chunked data structure ! (pgcol ordering chosen to reflect begchunk:endchunk ! local ordering) do l=1,ldim !DIR$ PREFERVECTOR !DIR$ PREFERSTREAM !DIR$ CONCURRENT do i=1,ngcols_p cid = pgcols(i)%chunk lcid = chunks(cid)%lcid lid = pgcols(i)%ccol gcol = chunks(cid)%gcol(lid) h2 = (gcol-1)/hdim1_d + 1 h1 = mod((gcol-1),hdim1_d) + 1 do m=1,mdim do f=1,fdim localchunks(f,lid,m,lcid,l) = & globalfield(f, h1, m, h2, l) end do end do end do end do #endif return end subroutine scatter_field_to_chunk_int ! !======================================================================== ! ! subroutine chunk_to_buff(fdim,mdim,localchunks,lbuff) ! !----------------------------------------------------------------------- ! ! Purpose: Copy from local chunk data structure ! to local buffer. Needed for cpl6. ! (local = assigned to same process) ! ! Method: ! ! Author: Pat Worley and Robert Jacob !----------------------------------------------------------------------- !------------------------------Arguments-------------------------------- ! integer, intent(in) :: fdim ! declared length of first lbuff dimension ! integer, intent(in) :: mdim ! declared length of middle lbuff dimension ! real(r8), intent(in):: localchunks(pcols,mdim, begchunk:endchunk) ! local chunks ! ! real(r8), intent(out) :: lbuff(fdim,mdim) ! local buff ! !---------------------------Local workspace----------------------------- ! integer :: i,j,m,n ! loop indices ! ! integer, save :: numcols = 0 ! integer, allocatable, save :: columnid(:), chunkid(:) !----------------------------------------------------------------------- ! ! if (numcols .eq. 0) then ! n = 0 ! do i=1,ngcols ! if (dyn_to_latlon_gcol_map(i) .ne. -1) then ! if(chunks(knuhcs(i)%chunkid)%owner .eq. iam) then ! n = n + 1 ! endif ! endif ! enddo ! allocate(columnid(1:n)) ! allocate(chunkid(1:n)) ! ! n = 0 ! do i=1,ngcols ! 
if (dyn_to_latlon_gcol_map(i) .ne. -1) then ! if(chunks(knuhcs(i)%chunkid)%owner .eq. iam) then ! n = n + 1 ! columnid(n) = knuhcs(i)%col ! chunkid(n) = chunks(knuhcs(i)%chunkid)%lcid ! endif ! endif ! end do ! ! numcols = n ! endif ! ! if (numcols .gt. fdim) call endrun('chunk_to_buff') ! do m=1,mdim !dir$ concurrent !dir$ prefervector, preferstream ! do n = 1, numcols ! lbuff(n,m) = localchunks(columnid(n),m,chunkid(n)) ! end do ! end do ! ! return ! end subroutine chunk_to_buff ! ! !======================================================================== ! subroutine gather_chunk_to_field(fdim,mdim,ldim, & hdim1d,localchunks,globalfield) !----------------------------------------------------------------------- ! ! Purpose: Reconstruct field ! from decomposed chunk data structure ! ! Method: ! ! Author: Patrick Worley ! !----------------------------------------------------------------------- #if ( defined SPMD ) use spmd_utils, only: fc_gatherv #endif !------------------------------Arguments-------------------------------- integer, intent(in) :: fdim ! declared length of first dimension integer, intent(in) :: mdim ! declared length of middle dimension integer, intent(in) :: ldim ! declared length of last dimension integer, intent(in) :: hdim1d ! declared first horizontal index ! dimension real(r8), intent(in):: localchunks(fdim,pcols,mdim, & begchunk:endchunk,ldim) ! local chunks real(r8), intent(out) :: globalfield(fdim,hdim1d,mdim,hdim2_d,ldim) ! global field !---------------------------Local workspace----------------------------- integer :: f,i,m,l,p ! loop indices integer :: cid ! global chunk id integer :: lcid ! local chunk id integer :: lid ! local column index integer :: gcol ! global column index integer :: h1 ! first horizontal dimension index integer :: h2 ! second horizontal dimension index #if ( defined SPMD ) real(r8) gfield_p(fdim,mdim,ldim,ngcols) ! vector to be gathered real(r8) lfield_p(fdim,mdim,ldim,nlcols) ! local component of gather ! vector integer :: displs(0:npes-1) ! gather displacements integer :: rcvcnts(0:npes-1) ! gather receive counts integer :: sendcnt ! gather send count integer :: beglcol ! beginning index for local columns ! in global column ordering #endif !----------------------------------------------------------------------- if (hdim1d < hdim1_d) then call endrun ('GATHER_CHUNK_TO_FIELD error: hdim1d < hdim1_d') endif #if ( defined SPMD ) displs(0) = 0 rcvcnts(0) = fdim*mdim*ldim*gs_col_num(0) beglcol = 0 do p=1,npes-1 displs(p) = displs(p-1) + rcvcnts(p-1) rcvcnts(p) = fdim*mdim*ldim*gs_col_num(p) if (p <= iam) then beglcol = beglcol + gs_col_num(p-1) endif enddo sendcnt = fdim*mdim*ldim*nlcols ! copy into local gather data structure do l=1,ldim !DIR$ PREFERVECTOR, PREFERSTREAM !DIR$ CONCURRENT do i=1,nlcols cid = pgcols(beglcol+i)%chunk lcid = chunks(cid)%lcid lid = pgcols(beglcol+i)%ccol do m=1,mdim do f=1,fdim lfield_p(f, m, l, i) = & localchunks(f,lid,m,lcid,l) end do end do end do end do ! gather from other processes call t_barrierf('sync_gath_ctof', mpicom) call fc_gatherv(lfield_p, sendcnt, mpir8, & gfield_p, rcvcnts, displs, mpir8, 0, mpicom) if (masterproc) then !
copy gathered columns into lon/lat field !DIR$ PREFERVECTOR, PREFERSTREAM !DIR$ CONCURRENT do i=1,ngcols_p cid = pgcols(i)%chunk lid = pgcols(i)%ccol gcol = chunks(cid)%gcol(lid) h2 = (gcol-1)/hdim1_d + 1 h1 = mod((gcol-1),hdim1_d) + 1 do l=1,ldim do m=1,mdim do f=1,fdim globalfield(f, h1, m, h2, l) & = gfield_p(f,m,l,i) end do end do end do end do endif call mpibarrier(mpicom) #else ! copy chunked data structure into dynamics field ! (pgcol ordering chosen to reflect begchunk:endchunk ! local ordering) do l=1,ldim !DIR$ PREFERVECTOR, PREFERSTREAM !DIR$ CONCURRENT do i=1,ngcols_p cid = pgcols(i)%chunk lcid = chunks(cid)%lcid lid = pgcols(i)%ccol gcol = chunks(cid)%gcol(lid) h2 = (gcol-1)/hdim1_d + 1 h1 = mod((gcol-1),hdim1_d) + 1 do m=1,mdim do f=1,fdim globalfield(f, h1, m, h2, l) & = localchunks(f,lid,m,lcid,l) end do end do end do end do #endif return end subroutine gather_chunk_to_field ! !======================================================================== ! subroutine gather_chunk_to_field4 (fdim,mdim,ldim, & hdim1d,localchunks,globalfield) !----------------------------------------------------------------------- ! ! Purpose: Reconstruct field ! from decomposed chunk data structure ! ! Method: ! ! Author: Patrick Worley ! !----------------------------------------------------------------------- #if ( defined SPMD ) use spmd_utils, only: fc_gathervr4 #endif !------------------------------Arguments-------------------------------- integer, intent(in) :: fdim ! declared length of first dimension integer, intent(in) :: mdim ! declared length of middle dimension integer, intent(in) :: ldim ! declared length of last dimension integer, intent(in) :: hdim1d ! declared first horizontal index ! dimension real(r4), intent(in):: localchunks(fdim,pcols,mdim, & begchunk:endchunk,ldim) ! local chunks real(r4), intent(out) :: globalfield(fdim,hdim1d,mdim,hdim2_d,ldim) ! global field !---------------------------Local workspace----------------------------- integer :: f,i,m,l,p ! loop indices integer :: cid ! global chunk id integer :: lcid ! local chunk id integer :: lid ! local column index integer :: gcol ! global column index integer :: h1 ! first horizontal dimension index integer :: h2 ! second horizontal dimension index #if ( defined SPMD ) real(r4) gfield_p(fdim,mdim,ldim,ngcols) ! vector to be gathered real(r4) lfield_p(fdim,mdim,ldim,nlcols) ! local component of gather ! vector integer :: displs(0:npes-1) ! gather displacements integer :: rcvcnts(0:npes-1) ! gather receive counts integer :: sendcnt ! gather send count integer :: beglcol ! beginning index for local columns ! in global column ordering #endif !----------------------------------------------------------------------- if (hdim1d < hdim1_d) then call endrun ('GATHER_CHUNK_TO_FIELD4 error: hdim1d < hdim1_d') endif #if ( defined SPMD ) displs(0) = 0 rcvcnts(0) = fdim*mdim*ldim*gs_col_num(0) beglcol = 0 do p=1,npes-1 displs(p) = displs(p-1) + rcvcnts(p-1) rcvcnts(p) = fdim*mdim*ldim*gs_col_num(p) if (p <= iam) then beglcol = beglcol + gs_col_num(p-1) endif enddo sendcnt = fdim*mdim*ldim*nlcols ! copy into local gather data structure do l=1,ldim !DIR$ PREFERVECTOR, PREFERSTREAM !DIR$ CONCURRENT do i=1,nlcols cid = pgcols(beglcol+i)%chunk lcid = chunks(cid)%lcid lid = pgcols(beglcol+i)%ccol do m=1,mdim do f=1,fdim lfield_p(f, m, l, i) = & localchunks(f,lid,m,lcid,l) end do end do end do end do !
gather from other processes call t_barrierf('sync_gath_ctof', mpicom) call fc_gathervr4(lfield_p, sendcnt, mpir4, & gfield_p, rcvcnts, displs, mpir4, 0, mpicom) if (masterproc) then ! copy gathered columns into lon/lat field !DIR$ PREFERVECTOR, PREFERSTREAM !DIR$ CONCURRENT do i=1,ngcols_p cid = pgcols(i)%chunk lid = pgcols(i)%ccol gcol = chunks(cid)%gcol(lid) h2 = (gcol-1)/hdim1_d + 1 h1 = mod((gcol-1),hdim1_d) + 1 do l=1,ldim do m=1,mdim do f=1,fdim globalfield(f, h1, m, h2, l) & = gfield_p(f,m,l,i) end do end do end do end do endif #else ! copy chunked data structure into dynamics field ! (pgcol ordering chosen to reflect begchunk:endchunk ! local ordering) do l=1,ldim !DIR$ PREFERVECTOR, PREFERSTREAM !DIR$ CONCURRENT do i=1,ngcols_p cid = pgcols(i)%chunk lcid = chunks(cid)%lcid lid = pgcols(i)%ccol gcol = chunks(cid)%gcol(lid) h2 = (gcol-1)/hdim1_d + 1 h1 = mod((gcol-1),hdim1_d) + 1 do m=1,mdim do f=1,fdim globalfield(f, h1, m, h2, l) & = localchunks(f,lid,m,lcid,l) end do end do end do end do #endif return end subroutine gather_chunk_to_field4 ! !======================================================================== ! subroutine gather_chunk_to_field_int (fdim,mdim,ldim, & hdim1d,localchunks,globalfield) !----------------------------------------------------------------------- ! ! Purpose: Reconstruct field ! from decomposed chunk data structure ! ! Method: ! ! Author: Patrick Worley ! !----------------------------------------------------------------------- #if ( defined SPMD ) use spmd_utils, only: fc_gathervint #endif !------------------------------Arguments-------------------------------- integer, intent(in) :: fdim ! declared length of first dimension integer, intent(in) :: mdim ! declared length of middle dimension integer, intent(in) :: ldim ! declared length of last dimension integer, intent(in) :: hdim1d ! declared first horizontal index ! dimension integer, intent(in):: localchunks(fdim,pcols,mdim,begchunk:endchunk,ldim) ! local chunks !JR Changed globalfield to inout because slaves under lf95 pass a bogus argument, which will result !JR in trash being written to useful memory if intent(out) is specified. THIS SHOULD BE FIXED!!! integer, intent(inout) :: globalfield(fdim,hdim1d,mdim,hdim2_d,ldim) ! global field !---------------------------Local workspace----------------------------- integer :: f,i,m,l,p ! loop indices integer :: cid ! global chunk id integer :: lcid ! local chunk id integer :: lid ! local column index integer :: gcol ! global column index integer :: h1 ! first horizontal dimension index integer :: h2 ! second horizontal dimension index #if ( defined SPMD ) integer gfield_p(fdim,mdim,ldim,ngcols) ! vector to be gathered integer lfield_p(fdim,mdim,ldim,nlcols) ! local component of gather ! vector integer :: displs(0:npes-1) ! gather displacements integer :: rcvcnts(0:npes-1) ! gather receive counts integer :: sendcnt ! gather send count integer :: beglcol ! beginning index for local columns ! in global column ordering #endif !----------------------------------------------------------------------- if (hdim1d < hdim1_d) then call endrun ('GATHER_CHUNK_TO_FIELD_INT error: hdim1d < hdim1_d') endif #if ( defined SPMD ) displs(0) = 0 rcvcnts(0) = fdim*mdim*ldim*gs_col_num(0) beglcol = 0 do p=1,npes-1 displs(p) = displs(p-1) + rcvcnts(p-1) rcvcnts(p) = fdim*mdim*ldim*gs_col_num(p) if (p <= iam) then beglcol = beglcol + gs_col_num(p-1) endif enddo sendcnt = fdim*mdim*ldim*nlcols !
copy into local gather data structure do l=1,ldim !DIR$ PREFERVECTOR, PREFERSTREAM !DIR$ CONCURRENT do i=1,nlcols cid = pgcols(beglcol+i)%chunk lcid = chunks(cid)%lcid lid = pgcols(beglcol+i)%ccol do m=1,mdim do f=1,fdim lfield_p(f, m, l, i) = & localchunks(f,lid,m,lcid,l) end do end do end do end do ! gather from other processes call t_barrierf('sync_gath_ctof', mpicom) call fc_gathervint(lfield_p, sendcnt, mpiint, & gfield_p, rcvcnts, displs, mpiint, 0, mpicom) if (masterproc) then ! copy gathered columns into lon/lat field !DIR$ PREFERVECTOR, PREFERSTREAM !DIR$ CONCURRENT do i=1,ngcols_p cid = pgcols(i)%chunk lid = pgcols(i)%ccol gcol = chunks(cid)%gcol(lid) h2 = (gcol-1)/hdim1_d + 1 h1 = mod((gcol-1),hdim1_d) + 1 do l=1,ldim do m=1,mdim do f=1,fdim globalfield(f, h1, m, h2, l) & = gfield_p(f,m,l,i) end do end do end do end do endif #else ! copy chunked data structure into lon/lat field ! (pgcol ordering chosen to reflect begchunk:endchunk ! local ordering) do l=1,ldim !DIR$ PREFERVECTOR, PREFERSTREAM !DIR$ CONCURRENT do i=1,ngcols_p cid = pgcols(i)%chunk lcid = chunks(cid)%lcid lid = pgcols(i)%ccol gcol = chunks(cid)%gcol(lid) h2 = (gcol-1)/hdim1_d + 1 h1 = mod((gcol-1),hdim1_d) + 1 do m=1,mdim do f=1,fdim globalfield(f, h1, m, h2, l) & = localchunks(f,lid,m,lcid,l) end do end do end do end do #endif return end subroutine gather_chunk_to_field_int ! !======================================================================== ! subroutine write_field_from_chunk(iu,fdim,mdim,ldim,localchunks) !----------------------------------------------------------------------- ! ! ! Purpose: Write field from decomposed chunk data ! structure ! ! Method: ! ! Author: Patrick Worley ! !------------------------------Arguments-------------------------------- integer, intent(in) :: iu ! logical unit integer, intent(in) :: fdim ! declared length of first dimension integer, intent(in) :: mdim ! declared length of middle dimension integer, intent(in) :: ldim ! declared length of last dimension real(r8), intent(in):: localchunks(fdim,pcols,mdim,begchunk:endchunk,ldim) ! local chunks !---------------------------Local workspace----------------------------- integer :: ioerr ! error return real(r8), allocatable :: globalfield(:,:,:,:,:) ! global field !----------------------------------------------------------------------- allocate(globalfield(fdim,hdim1_d,mdim,hdim2_d,ldim)) call gather_chunk_to_field (fdim,mdim,ldim,hdim1_d,localchunks,globalfield) if (masterproc) then write (iu,iostat=ioerr) globalfield if (ioerr /= 0 ) then write(iulog,*) 'WRITE_FIELD_FROM_CHUNK ioerror ', ioerr,' on i/o unit = ',iu call endrun end if endif deallocate(globalfield) return end subroutine write_field_from_chunk ! !======================================================================== ! subroutine read_chunk_from_field(iu,fdim,mdim,ldim,localchunks) !----------------------------------------------------------------------- ! ! ! Purpose: Read field into decomposed chunk data ! structure ! ! Method: ! ! Author: Patrick Worley ! !------------------------------Arguments-------------------------------- integer, intent(in) :: iu ! logical unit integer, intent(in) :: fdim ! declared length of first dimension integer, intent(in) :: mdim ! declared length of middle dimension integer, intent(in) :: ldim ! declared length of last dimension real(r8), intent(out):: localchunks(fdim,pcols,mdim,begchunk:endchunk,ldim) ! local chunks !---------------------------Local workspace----------------------------- integer :: ioerr ! error return real(r8), allocatable :: globalfield(:,:,:,:,:) ! global field !----------------------------------------------------------------------- allocate(globalfield(fdim,hdim1_d,mdim,hdim2_d,ldim)) if (masterproc) then read (iu,iostat=ioerr) globalfield if (ioerr /= 0 ) then write(iulog,*) 'READ_CHUNK_FROM_FIELD ioerror ', ioerr,' on i/o unit = ',iu call endrun end if endif call scatter_field_to_chunk (fdim,mdim,ldim,hdim1_d,globalfield,localchunks) deallocate(globalfield) return end subroutine read_chunk_from_field ! !========================================================================
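!
! Commented sketch of a write/read round trip through the two routines
! above, assuming unit iu is already open for unformatted sequential I/O
! and field_c is a hypothetical chunked array with pver levels:
!
!   call write_field_from_chunk(iu, 1, pver, 1, field_c)  ! gather and write
!   rewind(iu)
!   call read_chunk_from_field(iu, 1, pver, 1, field_c)   ! read and scatter
!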
subroutine transpose_block_to_chunk(record_size, block_buffer, & chunk_buffer, window) !----------------------------------------------------------------------- ! ! Purpose: Transpose buffer containing decomposed ! fields to buffer ! containing decomposed chunk data structures ! ! Method: ! ! Author: Patrick Worley ! Modified: Art Mirin, Jan 04, to add support for mod_comm ! !----------------------------------------------------------------------- #if ( defined SPMD ) # if defined(MODCM_DP_TRANSPOSE) use mod_comm, only: blockdescriptor, mp_sendirr, mp_recvirr, & get_partneroffset, max_nparcels use mpishorthand, only : mpicom # endif use spmd_utils, only: altalltoallv #endif !------------------------------Parameters------------------------------- ! integer, parameter :: msgtag = 6000 !------------------------------Arguments-------------------------------- integer, intent(in) :: record_size ! per column amount of data real(r8), intent(in) :: block_buffer(record_size*block_buf_nrecs) ! buffer of block data to be ! transposed real(r8), intent(out):: chunk_buffer(record_size*chunk_buf_nrecs) ! buffer of chunk data ! transposed into integer, intent(in), optional :: window ! MPI-2 window id for ! chunk_buffer !---------------------------Local workspace----------------------------- #if ( defined SPMD ) integer :: i, p ! loop indices integer :: bbuf_siz ! size of block_buffer integer :: cbuf_siz ! size of chunk_buffer integer :: lwindow ! placeholder for missing window integer :: lopt ! local copy of phys_alltoall ! logical, save :: first = .true. integer, allocatable, save :: sndcnts(:), sdispls(:) integer, allocatable, save :: rcvcnts(:), rdispls(:) integer, allocatable, save :: pdispls(:) integer, save :: prev_record_size = 0 # if defined(MODCM_DP_TRANSPOSE) type (blockdescriptor), allocatable, save :: sendbl(:), recvbl(:) integer ione, ierror, mod_method # endif !----------------------------------------------------------------------- if (first) then ! Compute send/recv/put counts and displacements allocate(sndcnts(0:npes-1)) allocate(sdispls(0:npes-1)) allocate(rcvcnts(0:npes-1)) allocate(rdispls(0:npes-1)) allocate(pdispls(0:npes-1)) ! # if defined(MODCM_DP_TRANSPOSE) ! This branch uses mod_comm. Admissible values of phys_alltoall are ! 11,12 and 13. Each value corresponds to a different option ! within mod_comm of implementing the communication. That option is expressed ! internally to mod_comm using the variable mod_method defined below; ! mod_method will have values 0,1 or 2 and is defined as ! phys_alltoall - modmin_alltoall, where modmin_alltoall equals 11. ! Also, sendbl and recvbl must have exactly npes elements, to match ! the size of the communicator, or the transpose will fail. ! if (phys_alltoall .ge.
modmin_alltoall) then mod_method = phys_alltoall - modmin_alltoall ione = 1 allocate( sendbl(0:npes-1) ) allocate( recvbl(0:npes-1) ) do p = 0,npes-1 sendbl(p)%method = mod_method recvbl(p)%method = mod_method allocate( sendbl(p)%blocksizes(1) ) allocate( sendbl(p)%displacements(1) ) allocate( recvbl(p)%blocksizes(1) ) allocate( recvbl(p)%displacements(1) ) enddo endif # endif first = .false. endif ! if (record_size .ne. prev_record_size) then ! ! Compute send/recv/put counts and displacements sdispls(0) = 0 sndcnts(0) = record_size*btofc_blk_num(0) do p=1,npes-1 sdispls(p) = sdispls(p-1) + sndcnts(p-1) sndcnts(p) = record_size*btofc_blk_num(p) enddo ! rdispls(0) = 0 rcvcnts(0) = record_size*btofc_chk_num(0) do p=1,npes-1 rdispls(p) = rdispls(p-1) + rcvcnts(p-1) rcvcnts(p) = record_size*btofc_chk_num(p) enddo ! call mpialltoallint(rdispls, 1, pdispls, 1, mpicom) ! # if defined(MODCM_DP_TRANSPOSE) if (phys_alltoall .ge. modmin_alltoall) then do p = 0,npes-1 sendbl(p)%type = MPI_DATATYPE_NULL if ( sndcnts(p) .ne. 0 ) then if (phys_alltoall .gt. modmin_alltoall) then call MPI_TYPE_INDEXED(ione, sndcnts(p), & sdispls(p), mpir8, & sendbl(p)%type, ierror) call MPI_TYPE_COMMIT(sendbl(p)%type, ierror) endif sendbl(p)%blocksizes(1) = sndcnts(p) sendbl(p)%displacements(1) = sdispls(p) sendbl(p)%partneroffset = 0 else sendbl(p)%blocksizes(1) = 0 sendbl(p)%displacements(1) = 0 sendbl(p)%partneroffset = 0 endif sendbl(p)%nparcels = size(sendbl(p)%displacements) sendbl(p)%tot_size = sum(sendbl(p)%blocksizes) max_nparcels = max(max_nparcels, sendbl(p)%nparcels) recvbl(p)%type = MPI_DATATYPE_NULL if ( rcvcnts(p) .ne. 0) then if (phys_alltoall .gt. modmin_alltoall) then call MPI_TYPE_INDEXED(ione, rcvcnts(p), & rdispls(p), mpir8, & recvbl(p)%type, ierror) call MPI_TYPE_COMMIT(recvbl(p)%type, ierror) endif recvbl(p)%blocksizes(1) = rcvcnts(p) recvbl(p)%displacements(1) = rdispls(p) recvbl(p)%partneroffset = 0 ! not properly initialized - do not use Mpi2 else recvbl(p)%blocksizes(1) = 0 recvbl(p)%displacements(1) = 0 recvbl(p)%partneroffset = 0 endif recvbl(p)%nparcels = size(recvbl(p)%displacements) recvbl(p)%tot_size = sum(recvbl(p)%blocksizes) max_nparcels = max(max_nparcels, recvbl(p)%nparcels) enddo call get_partneroffset(mpicom, sendbl, recvbl) endif # endif ! prev_record_size = record_size endif ! call t_barrierf('sync_tran_btoc', mpicom) if (phys_alltoall < 0) then if ((max_nproc_smpx > npes/2) .and. (nproc_busy_d > npes/2)) then lopt = 0 else lopt = 1 endif else lopt = phys_alltoall if ((lopt .eq. 2) .and. ( .not. present(window) )) lopt = 1 endif if (lopt < 4) then ! bbuf_siz = record_size*block_buf_nrecs cbuf_siz = record_size*chunk_buf_nrecs if ( present(window) ) then call altalltoallv(lopt, iam, npes, & dp_coup_steps, dp_coup_proc, & block_buffer, bbuf_siz, sndcnts, sdispls, mpir8, & chunk_buffer, cbuf_siz, rcvcnts, rdispls, mpir8, & msgtag, pdispls, mpir8, window, mpicom) else call altalltoallv(lopt, iam, npes, & dp_coup_steps, dp_coup_proc, & block_buffer, bbuf_siz, sndcnts, sdispls, mpir8, & chunk_buffer, cbuf_siz, rcvcnts, rdispls, mpir8, & msgtag, pdispls, mpir8, lwindow, mpicom) endif ! else ! # if defined(MODCM_DP_TRANSPOSE) call mp_sendirr(mpicom, sendbl, recvbl, block_buffer, chunk_buffer) call mp_recvirr(mpicom, sendbl, recvbl, block_buffer, chunk_buffer) # else call mpialltoallv(block_buffer, sndcnts, sdispls, mpir8, & chunk_buffer, rcvcnts, rdispls, mpir8, & mpicom) # endif ! endif ! #endif return end subroutine transpose_block_to_chunk ! 
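!
! A hedged sketch of the intended block-to-chunk calling sequence, using
! the pointer routines defined below; bbuffer, cbuffer, bpter, cpter, and
! rsize, as well as the (pcols, pver+1) pter dimensions, are hypothetical:
!
!   call block_to_chunk_send_pters(blockid, pcols, pver+1, rsize, bpter)
!   ! ... copy each column's records into bbuffer at offsets bpter(i,k) ...
!   call transpose_block_to_chunk(rsize, bbuffer, cbuffer)
!   call block_to_chunk_recv_pters(lcid, pcols, pver+1, rsize, cpter)
!   ! ... copy records out of cbuffer at offsets cpter(i,k) ...
!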
!======================================================================== subroutine block_to_chunk_send_pters(blockid, fdim, ldim, & record_size, pter) !----------------------------------------------------------------------- ! ! Purpose: Return pointers into send buffer where column from decomposed ! fields should be copied to ! ! Method: ! ! Author: Patrick Worley ! !----------------------------------------------------------------------- !------------------------------Arguments-------------------------------- integer, intent(in) :: blockid ! block index integer, intent(in) :: fdim ! first dimension of pter array integer, intent(in) :: ldim ! last dimension of pter array integer, intent(in) :: record_size ! per coordinate amount of data integer, intent(out) :: pter(fdim,ldim) ! buffer offsets !---------------------------Local workspace----------------------------- integer :: i, k ! loop indices !----------------------------------------------------------------------- if ((btofc_blk_offset(blockid)%ncols > fdim) .or. & (btofc_blk_offset(blockid)%nlvls > ldim)) then write(iulog,*) "BLOCK_TO_CHUNK_SEND_PTERS: pter array dimensions ", & "not large enough: (",fdim,",",ldim,") not >= (", & btofc_blk_offset(blockid)%ncols,",", & btofc_blk_offset(blockid)%nlvls,")" call endrun() endif ! do k=1,btofc_blk_offset(blockid)%nlvls do i=1,btofc_blk_offset(blockid)%ncols pter(i,k) = 1 + record_size* & (btofc_blk_offset(blockid)%pter(i,k)) enddo do i=btofc_blk_offset(blockid)%ncols+1,fdim pter(i,k) = -1 enddo enddo ! do k=btofc_blk_offset(blockid)%nlvls+1,ldim do i=1,fdim pter(i,k) = -1 enddo enddo ! return end subroutine block_to_chunk_send_pters ! !======================================================================== subroutine block_to_chunk_recv_pters(lcid, fdim, ldim, & record_size, pter) !----------------------------------------------------------------------- ! ! Purpose: Return pointers into receive buffer where data for ! decomposed chunk data structures should be copied from ! ! Method: ! ! Author: Patrick Worley ! !----------------------------------------------------------------------- !------------------------------Arguments-------------------------------- integer, intent(in) :: lcid ! local chunk id integer, intent(in) :: fdim ! first dimension of pter array integer, intent(in) :: ldim ! last dimension of pter array integer, intent(in) :: record_size ! per coordinate amount of data integer, intent(out) :: pter(fdim,ldim) ! buffer offset !---------------------------Local workspace----------------------------- integer :: i, k ! loop indices !----------------------------------------------------------------------- if ((btofc_chk_offset(lcid)%ncols > fdim) .or. & (btofc_chk_offset(lcid)%nlvls > ldim)) then write(iulog,*) "BLOCK_TO_CHUNK_RECV_PTERS: pter array dimensions ", & "not large enough: (",fdim,",",ldim,") not >= (", & btofc_chk_offset(lcid)%ncols,",", & btofc_chk_offset(lcid)%nlvls,")" call endrun() endif ! do k=1,btofc_chk_offset(lcid)%nlvls do i=1,btofc_chk_offset(lcid)%ncols pter(i,k) = 1 + record_size* & (btofc_chk_offset(lcid)%pter(i,k)) enddo do i=btofc_chk_offset(lcid)%ncols+1,fdim pter(i,k) = -1 enddo enddo ! do k=btofc_chk_offset(lcid)%nlvls+1,ldim do i=1,fdim pter(i,k) = -1 enddo enddo ! return end subroutine block_to_chunk_recv_pters !
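!
! The pter arrays returned above index flat buffers: for column i and
! level group k the record occupies record_size consecutive entries
! starting at pter(i,k), and unused entries are flagged with -1. A
! commented sketch of draining a receive buffer (cbuffer, cpter, ncols,
! and nlvls are hypothetical caller names):
!
!   do k=1,nlvls
!      do i=1,ncols
!         ! record (i,k) is cbuffer(cpter(i,k) : cpter(i,k)+record_size-1)
!      end do
!   end do
!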
!======================================================================== subroutine transpose_chunk_to_block(record_size, chunk_buffer, & block_buffer, window) !----------------------------------------------------------------------- ! ! Purpose: Transpose buffer containing decomposed ! chunk data structures to buffer ! containing decomposed fields ! ! Method: ! ! Author: Patrick Worley ! !----------------------------------------------------------------------- #if ( defined SPMD ) # if defined(MODCM_DP_TRANSPOSE) use mod_comm, only: blockdescriptor, mp_sendirr, mp_recvirr, & get_partneroffset, max_nparcels use mpishorthand, only : mpicom # endif use spmd_utils, only: altalltoallv #endif !------------------------------Parameters------------------------------- ! integer, parameter :: msgtag = 7000 !------------------------------Arguments-------------------------------- integer, intent(in) :: record_size ! per column amount of data real(r8), intent(in):: chunk_buffer(record_size*chunk_buf_nrecs) ! buffer of chunk data to be ! transposed real(r8), intent(out) :: block_buffer(record_size*block_buf_nrecs) ! buffer of block data to ! transpose into integer, intent(in), optional :: window ! MPI-2 window id for ! block_buffer !---------------------------Local workspace----------------------------- #if ( defined SPMD ) integer :: i, p ! loop indices integer :: bbuf_siz ! size of block_buffer integer :: cbuf_siz ! size of chunk_buffer integer :: lwindow ! placeholder for missing window integer :: lopt ! local copy of phys_alltoall ! logical, save :: first = .true. integer, allocatable, save :: sndcnts(:), sdispls(:) integer, allocatable, save :: rcvcnts(:), rdispls(:) integer, allocatable, save :: pdispls(:) integer, save :: prev_record_size = 0 # if defined(MODCM_DP_TRANSPOSE) type (blockdescriptor), allocatable, save :: sendbl(:), recvbl(:) integer ione, ierror, mod_method # endif !----------------------------------------------------------------------- if (first) then ! Compute send/recv/put counts and displacements allocate(sndcnts(0:npes-1)) allocate(sdispls(0:npes-1)) allocate(rcvcnts(0:npes-1)) allocate(rdispls(0:npes-1)) allocate(pdispls(0:npes-1)) ! # if defined(MODCM_DP_TRANSPOSE) ! This branch uses mod_comm. Admissible values of phys_alltoall are ! 11,12 and 13. Each value corresponds to a different option ! within mod_comm of implementing the communication. That option is expressed ! internally to mod_comm using the variable mod_method defined below; ! mod_method will have values 0,1 or 2 and is defined as ! phys_alltoall - modmin_alltoall, where modmin_alltoall equals 11. ! Also, sendbl and recvbl must have exactly npes elements, to match ! the size of the communicator, or the transpose will fail. ! if (phys_alltoall .ge. modmin_alltoall) then mod_method = phys_alltoall - modmin_alltoall ione = 1 allocate( sendbl(0:npes-1) ) allocate( recvbl(0:npes-1) ) do p = 0,npes-1 sendbl(p)%method = mod_method recvbl(p)%method = mod_method allocate( sendbl(p)%blocksizes(1) ) allocate( sendbl(p)%displacements(1) ) allocate( recvbl(p)%blocksizes(1) ) allocate( recvbl(p)%displacements(1) ) enddo endif # endif ! first = .false. endif ! if (record_size .ne. prev_record_size) then ! ! Compute send/recv/put counts and displacements sdispls(0) = 0 sndcnts(0) = record_size*btofc_chk_num(0) do p=1,npes-1 sdispls(p) = sdispls(p-1) + sndcnts(p-1) sndcnts(p) = record_size*btofc_chk_num(p) enddo !
rdispls(0) = 0 rcvcnts(0) = record_size*btofc_blk_num(0) do p=1,npes-1 rdispls(p) = rdispls(p-1) + rcvcnts(p-1) rcvcnts(p) = record_size*btofc_blk_num(p) enddo ! call mpialltoallint(rdispls, 1, pdispls, 1, mpicom) ! # if defined(MODCM_DP_TRANSPOSE) if (phys_alltoall .ge. modmin_alltoall) then do p = 0,npes-1 sendbl(p)%type = MPI_DATATYPE_NULL if ( sndcnts(p) .ne. 0 ) then if (phys_alltoall .gt. modmin_alltoall) then call MPI_TYPE_INDEXED(ione, sndcnts(p), & sdispls(p), mpir8, & sendbl(p)%type, ierror) call MPI_TYPE_COMMIT(sendbl(p)%type, ierror) endif sendbl(p)%blocksizes(1) = sndcnts(p) sendbl(p)%displacements(1) = sdispls(p) sendbl(p)%partneroffset = 0 else sendbl(p)%blocksizes(1) = 0 sendbl(p)%displacements(1) = 0 sendbl(p)%partneroffset = 0 endif sendbl(p)%nparcels = size(sendbl(p)%displacements) sendbl(p)%tot_size = sum(sendbl(p)%blocksizes) max_nparcels = max(max_nparcels, sendbl(p)%nparcels) recvbl(p)%type = MPI_DATATYPE_NULL if ( rcvcnts(p) .ne. 0) then if (phys_alltoall .gt. modmin_alltoall) then call MPI_TYPE_INDEXED(ione, rcvcnts(p), & rdispls(p), mpir8, & recvbl(p)%type, ierror) call MPI_TYPE_COMMIT(recvbl(p)%type, ierror) endif recvbl(p)%blocksizes(1) = rcvcnts(p) recvbl(p)%displacements(1) = rdispls(p) recvbl(p)%partneroffset = 0 ! not properly initialized - do not use Mpi2 else recvbl(p)%blocksizes(1) = 0 recvbl(p)%displacements(1) = 0 recvbl(p)%partneroffset = 0 endif recvbl(p)%nparcels = size(recvbl(p)%displacements) recvbl(p)%tot_size = sum(recvbl(p)%blocksizes) max_nparcels = max(max_nparcels, recvbl(p)%nparcels) enddo call get_partneroffset(mpicom, sendbl, recvbl) endif # endif ! prev_record_size = record_size endif ! call t_barrierf('sync_tran_ctob', mpicom) if (phys_alltoall < 0) then if ((max_nproc_smpx > npes/2) .and. (nproc_busy_d > npes/2)) then lopt = 0 else lopt = 1 endif else lopt = phys_alltoall if ((lopt .eq. 2) .and. ( .not. present(window) )) lopt = 1 endif if (lopt < 4) then ! bbuf_siz = record_size*block_buf_nrecs cbuf_siz = record_size*chunk_buf_nrecs if ( present(window) ) then call altalltoallv(lopt, iam, npes, & dp_coup_steps, dp_coup_proc, & chunk_buffer, cbuf_siz, sndcnts, sdispls, mpir8, & block_buffer, bbuf_siz, rcvcnts, rdispls, mpir8, & msgtag, pdispls, mpir8, window, mpicom) else call altalltoallv(lopt, iam, npes, & dp_coup_steps, dp_coup_proc, & chunk_buffer, cbuf_siz, sndcnts, sdispls, mpir8, & block_buffer, bbuf_siz, rcvcnts, rdispls, mpir8, & msgtag, pdispls, mpir8, lwindow, mpicom) endif ! else # if defined(MODCM_DP_TRANSPOSE) call mp_sendirr(mpicom, sendbl, recvbl, chunk_buffer, block_buffer) call mp_recvirr(mpicom, sendbl, recvbl, chunk_buffer, block_buffer) # else call mpialltoallv(chunk_buffer, sndcnts, sdispls, mpir8, & block_buffer, rcvcnts, rdispls, mpir8, & mpicom) # endif ! endif ! #endif return end subroutine transpose_chunk_to_block ! !========================================================================
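!
! The chunk-to-block direction mirrors the block-to-chunk sketch given
! earlier; buffer and pter names, rsize, and the (pcols, pver+1) pter
! dimensions are hypothetical:
!
!   call chunk_to_block_send_pters(lcid, pcols, pver+1, rsize, cpter)
!   ! ... fill cbuffer at offsets cpter(i,k) ...
!   call transpose_chunk_to_block(rsize, cbuffer, bbuffer)
!   call chunk_to_block_recv_pters(blockid, pcols, pver+1, rsize, bpter)
!   ! ... drain bbuffer at offsets bpter(i,k) ...
!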
subroutine chunk_to_block_send_pters(lcid, fdim, ldim, & record_size, pter) !----------------------------------------------------------------------- ! ! Purpose: Return pointers into send buffer where data for ! decomposed chunk data structures should be copied to ! ! Method: ! ! Author: Patrick Worley ! !----------------------------------------------------------------------- !------------------------------Arguments-------------------------------- integer, intent(in) :: lcid ! local chunk id integer, intent(in) :: fdim ! first dimension of pter array integer, intent(in) :: ldim ! last dimension of pter array integer, intent(in) :: record_size ! per coordinate amount of data integer, intent(out) :: pter(fdim,ldim) ! buffer offset !---------------------------Local workspace----------------------------- integer :: i, k ! loop indices !----------------------------------------------------------------------- if ((btofc_chk_offset(lcid)%ncols > fdim) .or. & (btofc_chk_offset(lcid)%nlvls > ldim)) then write(iulog,*) "CHUNK_TO_BLOCK_SEND_PTERS: pter array dimensions ", & "not large enough: (",fdim,",",ldim,") not >= (", & btofc_chk_offset(lcid)%ncols,",", & btofc_chk_offset(lcid)%nlvls,")" call endrun() endif ! do k=1,btofc_chk_offset(lcid)%nlvls do i=1,btofc_chk_offset(lcid)%ncols pter(i,k) = 1 + record_size* & (btofc_chk_offset(lcid)%pter(i,k)) enddo do i=btofc_chk_offset(lcid)%ncols+1,fdim pter(i,k) = -1 enddo enddo ! do k=btofc_chk_offset(lcid)%nlvls+1,ldim do i=1,fdim pter(i,k) = -1 enddo enddo ! return end subroutine chunk_to_block_send_pters ! !======================================================================== subroutine chunk_to_block_recv_pters(blockid, fdim, ldim, & record_size, pter) !----------------------------------------------------------------------- ! ! Purpose: Return pointers into receive buffer where column from decomposed ! fields should be copied from ! ! Method: ! ! Author: Patrick Worley ! !----------------------------------------------------------------------- !------------------------------Arguments-------------------------------- integer, intent(in) :: blockid ! block index integer, intent(in) :: fdim ! first dimension of pter array integer, intent(in) :: ldim ! last dimension of pter array integer, intent(in) :: record_size ! per coordinate amount of data integer, intent(out) :: pter(fdim,ldim) ! buffer offsets !---------------------------Local workspace----------------------------- integer :: i, k ! loop indices !----------------------------------------------------------------------- if ((btofc_blk_offset(blockid)%ncols > fdim) .or. & (btofc_blk_offset(blockid)%nlvls > ldim)) then write(iulog,*) "CHUNK_TO_BLOCK_RECV_PTERS: pter array dimensions ", & "not large enough: (",fdim,",",ldim,") not >= (", & btofc_blk_offset(blockid)%ncols,",", & btofc_blk_offset(blockid)%nlvls,")" call endrun() endif ! do k=1,btofc_blk_offset(blockid)%nlvls do i=1,btofc_blk_offset(blockid)%ncols pter(i,k) = 1 + record_size* & (btofc_blk_offset(blockid)%pter(i,k)) enddo do i=btofc_blk_offset(blockid)%ncols+1,fdim pter(i,k) = -1 enddo enddo ! do k=btofc_blk_offset(blockid)%nlvls+1,ldim do i=1,fdim pter(i,k) = -1 enddo enddo ! return end subroutine chunk_to_block_recv_pters ! !======================================================================== subroutine create_chunks(opt, chunks_per_thread) !----------------------------------------------------------------------- ! ! Purpose: Decompose physics computational grid into chunks, for ! improved serial efficiency and parallel load balance. ! ! Method: ! ! Author: Patrick Worley ! !----------------------------------------------------------------------- use pmgrid, only: plev use dyn_grid, only: get_block_bounds_d, get_block_gcol_cnt_d, & get_gcol_block_cnt_d, get_gcol_block_d, & get_block_owner_d, get_block_gcol_d !------------------------------Arguments-------------------------------- integer, intent(in) :: opt ! chunking option ! 0: chunks may cross block boundaries, but retain same ! process mapping as blocks. If possible, columns assigned ! as day/night pairs. Columns (or pairs) are wrap-mapped. !
May not work with vertically decomposed blocks. (default) ! 1: chunks may cross block boundaries, but retain same ! SMP-node mapping as blocks. If possible, columns assigned ! as day/night pairs. Columns (or pairs) are wrap-mapped. ! May not work with vertically decomposed blocks. ! 2: 2-column day/night and season column pairs wrap-mapped ! to chunks to also balance assignment of polar, mid-latitude, ! and equatorial columns across chunks. ! 3: same as 1 except that SMP defined to be pairs of consecutive ! processes ! 4: chunks may cross block boundaries, but retain same ! process mapping as blocks. Columns assigned to chunks ! in block ordering. ! May not work with vertically decomposed blocks. ! 5: Chunks do not cross latitude boundaries, and are block-mapped. integer, intent(in) :: chunks_per_thread ! target number of chunks per ! thread !---------------------------Local workspace----------------------------- integer :: i, j, p ! loop indices integer :: nlthreads ! number of local OpenMP threads integer :: npthreads(0:npes-1) ! number of OpenMP threads per process integer :: proc_smp_mapx(0:npes-1) ! process/virtual SMP node map integer :: firstblock, lastblock ! global block index bounds integer :: maxblksiz ! maximum number of columns in a dynamics block integer :: block_cnt ! number of blocks containing data ! for a given vertical column integer :: blockids(plev+1) ! block indices integer :: bcids(plev+1) ! block column indices integer :: nsmpx, nsmpy ! virtual SMP node counts and indices integer :: curgcol, twingcol ! global physics and dynamics column indices integer :: smp ! SMP node index integer :: cid ! chunk id integer :: jb, ib ! global block and columns indices integer :: blksiz ! current block size integer :: ntmp1, ntmp2, nlchunks ! work variables integer :: cbeg ! beginning longitude index for ! current chunk integer :: max_ncols ! upper bound on number of columns in a block integer :: ncols ! number of columns in current chunk logical :: error ! error flag ! indices for dynamics columns in given block integer, dimension(:), allocatable :: cols ! number of MPI processes per virtual SMP node (0:nsmpx-1) integer, dimension(:), allocatable :: nsmpprocs ! flag indicating whether a process is busy or idle during the dynamics (0:npes-1) logical, dimension(:), allocatable :: proc_busy_d ! flag indicating whether any of the processes assigned to an SMP node are busy ! during the dynamics, or whether all of them are idle (0:nsmps-1) logical, dimension(:), allocatable :: smp_busy_d ! actual SMP node/virtual SMP node map (0:nsmps-1) integer, dimension(:), allocatable :: smp_smp_mapx ! column/virtual SMP node map (ngcols) integer, dimension(:), allocatable :: col_smp_mapx ! number of columns assigned to a given virtual SMP node (0:nsmpx-1) integer, dimension(:), allocatable :: nsmpcolumns ! number of OpenMP threads per virtual SMP node (0:nsmpx-1) integer, dimension(:), allocatable :: nsmpthreads ! number of chunks assigned to a given virtual SMP node (0:nsmpx-1) integer, dimension(:), allocatable :: nsmpchunks ! maximum number of columns assigned to a chunk in a given virtual SMP node (0:nsmpx-1) integer, dimension(:), allocatable :: maxcol_chk ! number of chunks in given virtual SMP node receiving maximum number of columns ! (0:nsmpx-1) integer, dimension(:), allocatable :: maxcol_chks ! chunk id virtual offset (0:nsmpx-1) integer, dimension(:), allocatable :: cid_offset ! 
process-local chunk id (0:nsmpx-1) integer, dimension(:), allocatable :: local_cid #if ( defined _OPENMP ) integer omp_get_max_threads external omp_get_max_threads #endif !----------------------------------------------------------------------- ! ! Determine number of threads per process ! nlthreads = 1 #if ( defined _OPENMP ) nlthreads = OMP_GET_MAX_THREADS() #endif ! #if ( defined SPMD ) call mpiallgatherint(nlthreads, 1, npthreads, 1, mpicom) #else npthreads(0) = nlthreads proc_smp_map(0) = 0 #endif ! ! Determine index range for dynamics blocks ! call get_block_bounds_d(firstblock,lastblock) ! ! Determine maximum number of columns in a block ! maxblksiz = 0 do jb=firstblock,lastblock maxblksiz = max(maxblksiz,get_block_gcol_cnt_d(jb)) enddo ! ! determine which (and how many) processes are assigned ! dynamics blocks ! allocate( proc_busy_d(0:npes-1) ) proc_busy_d = .false. nproc_busy_d = 0 do jb=firstblock,lastblock p = get_block_owner_d(jb) if (.not. proc_busy_d(p) ) then proc_busy_d(p) = .true. nproc_busy_d = nproc_busy_d + 1 endif enddo ! ! Determine virtual SMP count and processes/virtual SMP map. ! If option 0 or >3, pretend that each SMP has only one process. ! If option 1, use SMP information. ! If option 2, pretend that all processes are in one SMP node. ! If option 3, pretend that each SMP node is made up of two ! processes, chosen to maximize load-balancing opportunities. ! ! For all options < 5, if there are "idle" dynamics processes, ! assign them to the virtual SMP nodes in wrap fashion. ! Communication between the active and idle dynamics ! processes is scatter/gather (no communications between ! idle dynamics processes) so there is no advantage to ! blocking the idle processes in these assignments. ! if ((opt <= 0) .or. (opt == 4)) then ! assign active dynamics processes to virtual SMP nodes nsmpx = 0 do p=0,npes-1 if (proc_busy_d(p)) then proc_smp_mapx(p) = nsmpx nsmpx = nsmpx + 1 endif enddo ! ! assign idle dynamics processes to virtual SMP nodes (wrap map) nsmpy = 0 do p=0,npes-1 if (.not. proc_busy_d(p)) then proc_smp_mapx(p) = nsmpy nsmpy = mod(nsmpy+1,nsmpx) endif enddo elseif (opt == 1) then allocate( smp_busy_d(0:nsmps-1) ) allocate( smp_smp_mapx(0:nsmps-1) ) ! ! determine SMP nodes assigned dynamics blocks smp_busy_d = .false. do p=0,npes-1 if ( proc_busy_d(p) ) then smp = proc_smp_map(p) smp_busy_d(smp) = .true. endif enddo ! ! determine number of SMP nodes assigned dynamics blocks nsmpx = 0 do smp=0,nsmps-1 if (smp_busy_d(smp)) then smp_smp_mapx(smp) = nsmpx nsmpx = nsmpx + 1 endif enddo ! ! assign processes in active dynamics SMP nodes to virtual SMP nodes do p=0,npes-1 smp = proc_smp_map(p) if (smp_busy_d(smp)) then proc_smp_mapx(p) = smp_smp_mapx(smp) endif enddo ! ! assign processes in idle dynamics SMP nodes to virtual SMP nodes (wrap map) nsmpy = 0 do p=0,npes-1 smp = proc_smp_map(p) if (.not. smp_busy_d(smp)) then proc_smp_mapx(p) = nsmpy nsmpy = mod(nsmpy+1,nsmpx) endif enddo ! deallocate( smp_busy_d ) deallocate( smp_smp_mapx ) elseif (opt == 2) then nsmpx = 1 do p=0,npes-1 proc_smp_mapx(p) = 0 enddo elseif (opt == 3) then ! find active process partners proc_smp_mapx = -1 call find_partners(opt,proc_busy_d,nsmpx,proc_smp_mapx) ! ! assign unassigned (idle dynamics) processes to virtual SMP nodes ! (wrap map) nsmpy = 0 do p=0,npes-1 if (proc_smp_mapx(p) .eq. -1) then proc_smp_mapx(p) = nsmpy nsmpy = mod(nsmpy+1,nsmpx) endif enddo else nsmpx = npes do p=0,npes-1 proc_smp_mapx(p) = p enddo endif ! deallocate( proc_busy_d ) ! ! 
Determine maximum number of processes assigned to a single ! virtual SMP node ! allocate( nsmpprocs(0:nsmpx-1) ) ! nsmpprocs(:) = 0 do p=0,npes-1 smp = proc_smp_mapx(p) nsmpprocs(smp) = nsmpprocs(smp) + 1 enddo max_nproc_smpx = maxval(nsmpprocs) ! deallocate( nsmpprocs ) ! ! Determine number of columns assigned to each ! virtual SMP in block decomposition allocate( col_smp_mapx(ngcols) ) ! col_smp_mapx(:) = -1 error = .false. do i=1,ngcols_p curgcol = latlon_to_dyn_gcol_map(i) block_cnt = get_gcol_block_cnt_d(curgcol) call get_gcol_block_d(curgcol,block_cnt,blockids,bcids) do jb=1,block_cnt p = get_block_owner_d(blockids(jb)) if (col_smp_mapx(i) .eq. -1) then col_smp_mapx(i) = proc_smp_mapx(p) elseif (col_smp_mapx(i) .ne. proc_smp_mapx(p)) then error = .true. endif enddo end do if (error) then write(iulog,*) "PHYS_GRID_INIT error: opt", opt, "specified, ", & "but vertical decomposition not limited to virtual SMP" call endrun() endif ! allocate( nsmpcolumns(0:nsmpx-1) ) nsmpcolumns(:) = 0 do i=1,ngcols_p curgcol = latlon_to_dyn_gcol_map(i) smp = col_smp_mapx(curgcol) nsmpcolumns(smp) = nsmpcolumns(smp) + 1 end do ! deallocate( col_smp_mapx ) ! ! Allocate other work space ! allocate( nsmpthreads(0:nsmpx-1) ) allocate( nsmpchunks (0:nsmpx-1) ) allocate( maxcol_chk (0:nsmpx-1) ) allocate( maxcol_chks(0:nsmpx-1) ) allocate( cid_offset (0:nsmpx-1) ) allocate( local_cid (0:nsmpx-1) ) allocate( cols(1:maxblksiz) ) ! ! Options 0-3: split local dynamics blocks into chunks, ! using wrap-map assignment of columns and ! day/night and north/south column pairs ! to chunks to improve load balance ! Option 0: local is per process ! Option 1: local is subset of processes assigned to same SMP node ! Option 2: local is global ! Option 3: local is a pair of processes chosen to maximize load balance, ! subject to the restriction that each process communicates ! with only one other process. ! Option 4: split local dynamics blocks into chunks, ! using block-map assignment of columns ! if ((opt >= 0) .and. (opt <= 4)) then ! ! Calculate number of threads available in each SMP node. ! nsmpthreads(:) = 0 do p=0,npes-1 smp = proc_smp_mapx(p) nsmpthreads(smp) = nsmpthreads(smp) + npthreads(p) enddo ! ! Determine number of chunks to keep all threads busy ! nchunks = 0 do smp=0,nsmpx-1 nsmpchunks(smp) = nsmpcolumns(smp)/pcols if (mod(nsmpcolumns(smp), pcols) .ne. 0) then nsmpchunks(smp) = nsmpchunks(smp) + 1 endif if (nsmpchunks(smp) < chunks_per_thread*nsmpthreads(smp)) then nsmpchunks(smp) = chunks_per_thread*nsmpthreads(smp) endif do while (mod(nsmpchunks(smp), nsmpthreads(smp)) .ne. 0) nsmpchunks(smp) = nsmpchunks(smp) + 1 enddo if (nsmpchunks(smp) > nsmpcolumns(smp)) then nsmpchunks(smp) = nsmpcolumns(smp) endif nchunks = nchunks + nsmpchunks(smp) enddo ! ! Determine maximum number of columns to assign to chunks ! in a given SMP ! do smp=0,nsmpx-1 if (nsmpchunks(smp) /= 0) then ntmp1 = nsmpcolumns(smp)/nsmpchunks(smp) ntmp2 = mod(nsmpcolumns(smp),nsmpchunks(smp)) if (ntmp2 > 0) then maxcol_chk(smp) = ntmp1 + 1 maxcol_chks(smp) = ntmp2 else maxcol_chk(smp) = ntmp1 maxcol_chks(smp) = nsmpchunks(smp) endif else maxcol_chk(smp) = 0 maxcol_chks(smp) = 0 endif enddo ! ! Allocate chunks and knuhcs data structures ! allocate( chunks(1:nchunks) ) allocate( knuhcs(1:ngcols) ) ! ! Initialize chunks and knuhcs data structures ! chunks(:)%ncols = 0 knuhcs(:)%chunkid = -1 knuhcs(:)%col = -1 ! ! Determine chunk id ranges for each SMP !
cid_offset(0) = 1 local_cid(0) = 0 do smp=1,nsmpx-1 cid_offset(smp) = cid_offset(smp-1) + nsmpchunks(smp-1) local_cid(smp) = 0 enddo ! ! Assign columns to chunks ! do jb=firstblock,lastblock p = get_block_owner_d(jb) smp = proc_smp_mapx(p) blksiz = get_block_gcol_cnt_d(jb) call get_block_gcol_d(jb,blksiz,cols) do ib = 1,blksiz ! ! Assign column to a chunk if not already assigned curgcol = cols(ib) if ((dyn_to_latlon_gcol_map(curgcol) .ne. -1) .and. & (knuhcs(curgcol)%chunkid == -1)) then ! ! Find next chunk with space ! (maxcol_chks > 0 test necessary for opt=4 block map) cid = cid_offset(smp) + local_cid(smp) if (maxcol_chks(smp) > 0) then do while (chunks(cid)%ncols >= maxcol_chk(smp)) local_cid(smp) = mod(local_cid(smp)+1,nsmpchunks(smp)) cid = cid_offset(smp) + local_cid(smp) enddo else do while (chunks(cid)%ncols >= maxcol_chk(smp)-1) local_cid(smp) = mod(local_cid(smp)+1,nsmpchunks(smp)) cid = cid_offset(smp) + local_cid(smp) enddo endif chunks(cid)%ncols = chunks(cid)%ncols + 1 if (chunks(cid)%ncols .eq. maxcol_chk(smp)) & maxcol_chks(smp) = maxcol_chks(smp) - 1 ! i = chunks(cid)%ncols chunks(cid)%gcol(i) = curgcol chunks(cid)%lon(i) = lon_p(curgcol) chunks(cid)%lat(i) = lat_p(curgcol) knuhcs(curgcol)%chunkid = cid knuhcs(curgcol)%col = i ! if ((opt < 4) .and. (twin_alg > 0)) then ! ! If space available, look to assign a load-balancing "twin" to same chunk if ( (chunks(cid)%ncols < maxcol_chk(smp)) .and. & (maxcol_chks(smp) > 0) ) then call find_twin(curgcol, smp, & proc_smp_mapx, twingcol) if (twingcol > 0) then chunks(cid)%ncols = chunks(cid)%ncols + 1 if (chunks(cid)%ncols .eq. maxcol_chk(smp)) & maxcol_chks(smp) = maxcol_chks(smp) - 1 ! i = chunks(cid)%ncols chunks(cid)%gcol(i) = twingcol chunks(cid)%lon(i) = lon_p(twingcol) chunks(cid)%lat(i) = lat_p(twingcol) knuhcs(twingcol)%chunkid = cid knuhcs(twingcol)%col = i endif ! endif ! ! Move on to next chunk (wrap map) local_cid(smp) = mod(local_cid(smp)+1,nsmpchunks(smp)) ! endif ! endif enddo enddo ! else ! ! Option 5: split individual dynamics blocks into chunks, ! assigning consecutive columns to the same chunk ! ! Determine total number of chunks and ! number of chunks in each "SMP node" ! (assuming no vertical decomposition) nchunks = 0 nsmpchunks(:) = 0 do j=firstblock,lastblock blksiz = get_block_gcol_cnt_d(j) nlchunks = blksiz/pcols if (pcols*(blksiz/pcols) /= blksiz) then nlchunks = nlchunks + 1 endif nchunks = nchunks + nlchunks p = get_block_owner_d(j) nsmpchunks(p) = nsmpchunks(p) + nlchunks enddo ! ! Determine chunk id ranges for each SMP ! cid_offset(0) = 1 local_cid(0) = 0 do smp=1,nsmpx-1 cid_offset(smp) = cid_offset(smp-1) + nsmpchunks(smp-1) local_cid(smp) = 0 enddo ! ! Allocate chunks and knuhcs data structures ! allocate( chunks(1:nchunks) ) allocate( knuhcs(1:ngcols) ) ! ! Initialize chunks and knuhcs data structures ! knuhcs(:)%chunkid = -1 knuhcs(:)%col = -1 cid = 0 do jb=firstblock,lastblock p = get_block_owner_d(jb) smp = proc_smp_mapx(p) blksiz = get_block_gcol_cnt_d(jb) call get_block_gcol_d(jb,blksiz,cols) ib = 0 do while (ib < blksiz) cid = cid_offset(smp) + local_cid(smp) max_ncols = min(pcols,blksiz-ib) ncols = 0 do i=1,max_ncols ib = ib + 1 ! check whether global index is for a column that dynamics ! intends to pass to the physics curgcol = cols(ib) if (dyn_to_latlon_gcol_map(curgcol) .ne. -1) then ! 
! Determine total number of chunks and
! number of chunks in each "SMP node"
! (assuming no vertical decomposition)
!
      nchunks = 0
      nsmpchunks(:) = 0
      do j=firstblock,lastblock
         blksiz = get_block_gcol_cnt_d(j)
         nlchunks = blksiz/pcols
         if (pcols*(blksiz/pcols) /= blksiz) then
            nlchunks = nlchunks + 1
         endif
         nchunks = nchunks + nlchunks
         p = get_block_owner_d(j)
         nsmpchunks(p) = nsmpchunks(p) + nlchunks
      enddo
!
! Determine chunk id ranges for each SMP
!
      cid_offset(0) = 1
      local_cid(0) = 0
      do smp=1,nsmpx-1
         cid_offset(smp) = cid_offset(smp-1) + nsmpchunks(smp-1)
         local_cid(smp) = 0
      enddo
!
! Allocate chunks and knuhcs data structures
!
      allocate( chunks(1:nchunks) )
      allocate( knuhcs(1:ngcols) )
!
! Initialize chunks and knuhcs data structures
!
      knuhcs(:)%chunkid = -1
      knuhcs(:)%col = -1
      cid = 0
      do jb=firstblock,lastblock
         p = get_block_owner_d(jb)
         smp = proc_smp_mapx(p)
         blksiz = get_block_gcol_cnt_d(jb)
         call get_block_gcol_d(jb,blksiz,cols)
         ib = 0
         do while (ib < blksiz)
            cid = cid_offset(smp) + local_cid(smp)
            max_ncols = min(pcols,blksiz-ib)
            ncols = 0
            do i=1,max_ncols
               ib = ib + 1
! check whether global index is for a column that dynamics
! intends to pass to the physics
               curgcol = cols(ib)
               if (dyn_to_latlon_gcol_map(curgcol) .ne. -1) then
! yes - then save the information
                  ncols = ncols + 1
                  chunks(cid)%gcol(ncols) = curgcol
                  chunks(cid)%lon(ncols)  = lon_p(curgcol)
                  chunks(cid)%lat(ncols)  = lat_p(curgcol)
                  knuhcs(curgcol)%chunkid = cid
                  knuhcs(curgcol)%col = ncols
               endif
            enddo
            chunks(cid)%ncols = ncols
            local_cid(smp) = local_cid(smp) + 1
         enddo
      enddo
!
! Set number of threads available in each "SMP node"
!
      do p=0,npes-1
         nsmpthreads(p) = npthreads(p)
      enddo
!
   endif
!
! Assign chunks to processes
!
   call assign_chunks(npthreads, nsmpx, proc_smp_mapx, &
                      nsmpthreads, nsmpchunks)
!
! Clean up
!
   deallocate( nsmpcolumns )
   deallocate( nsmpthreads )
   deallocate( nsmpchunks )
   deallocate( maxcol_chk )
   deallocate( maxcol_chks )
   deallocate( cid_offset )
   deallocate( local_cid )
   deallocate( cols )
   deallocate( knuhcs )

   return
   end subroutine create_chunks
!
!========================================================================

   subroutine find_partners(opt, proc_busy_d, nsmpx, proc_smp_mapx) 1,10
!-----------------------------------------------------------------------
!
! Purpose: Divide processes into pairs, attempting to maximize the
!          number of columns in one process whose twins are in the
!          other process.
!
! Method: The day/night and north/south hemisphere complement is defined
!         to be the column twin.
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
   use dyn_grid, only: get_gcol_block_cnt_d, get_gcol_block_d, &
                       get_block_owner_d
   use pmgrid, only: plev
!------------------------------Arguments--------------------------------
   integer, intent(in)  :: opt                     ! chunking option
   logical, intent(in)  :: proc_busy_d(0:npes-1)   ! active/idle dynamics process flags
   integer, intent(out) :: nsmpx                   ! calculated number of virtual
                                                   !  SMP nodes
   integer, intent(out) :: proc_smp_mapx(0:npes-1) ! process/virtual smp map
!---------------------------Local workspace-----------------------------
   integer :: gcol_latlon            ! physics column index (latlon sorted)
   integer :: twingcol_latlon        ! physics column index (latlon sorted)
   integer :: gcol, twingcol         ! physics column indices
   integer :: lon, lat, twinlat      ! longitude and latitude indices
   integer :: twinlon_off            ! estimate as to offset of twinlon
                                     !  on a latitude line
   integer :: block_cnt              ! number of blocks containing data
                                     !  for a given vertical column
   integer :: blockids(plev+1)       ! block indices
   integer :: bcids(plev+1)          ! block column indices
   integer :: jb                     ! block index
   integer :: p, twp                 ! process indices
   integer :: col_proc_mapx(ngcols)  ! location of columns in
                                     !  dynamics decomposition
   integer :: twin_proc_mapx(ngcols) ! location of column twins in
                                     !  dynamics decomposition
   integer :: twin_cnt(0:npes-1)     ! for each process, number of twins
                                     !  in each of the other processes
   logical :: assigned(0:npes-1)     ! flag indicating whether process
                                     !  assigned to an SMP node yet
   integer :: maxpartner, maxcnt     ! process with maximum number of
                                     !  twins and this count
   logical :: error                  ! error flag
!-----------------------------------------------------------------------
!
! Determine process location of column and its twin in dynamics decomposition
!
   col_proc_mapx(:) = -1
   twin_proc_mapx(:) = -1
   error = .false.
   do gcol_latlon=1,ngcols_p

! Assume latitude and longitude symmetries and that index manipulations
! are sufficient to find partners. (Will be true for lon/lat grids.)
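! (Illustrative example, assuming a full lon/lat grid and not taken from
!  the original source: with clat_p_tot = 64, a column at latitude index
!  lat = 10 has twinlat = 64+1-10 = 55; if that latitude circle holds
!  clat_p_cnt(55) = 128 longitudes and lon = 3, then
!  twinlon_off = mod(2+64,128) = 66, placing the twin roughly 180 degrees
!  away in longitude and mirrored in latitude.)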
      gcol = latlon_to_dyn_gcol_map(gcol_latlon)
      lat = lat_p(gcol)
      twinlat = clat_p_tot+1-lat
      lon = lon_p(gcol)
      twinlon_off = mod((lon-1)+(clat_p_cnt(twinlat)/2), clat_p_cnt(twinlat))
      twingcol_latlon = clat_p_idx(twinlat) + twinlon_off
      twingcol = latlon_to_dyn_gcol_map(twingcol_latlon)

      block_cnt = get_gcol_block_cnt_d(gcol)
      call get_gcol_block_d(gcol,block_cnt,blockids,bcids)
      do jb=1,block_cnt
         p = get_block_owner_d(blockids(jb))
         if (col_proc_mapx(gcol) .eq. -1) then
            col_proc_mapx(gcol) = p
         elseif (col_proc_mapx(gcol) .ne. p) then
            error = .true.
         endif
      enddo

      block_cnt = get_gcol_block_cnt_d(twingcol)
      call get_gcol_block_d(twingcol,block_cnt,blockids,bcids)
      do jb=1,block_cnt
         p = get_block_owner_d(blockids(jb))
         if (twin_proc_mapx(gcol) .eq. -1) then
            twin_proc_mapx(gcol) = p
         elseif (twin_proc_mapx(gcol) .ne. p) then
            error = .true.
         endif
      enddo

   end do
   if (error) then
      if (masterproc) then
         write(iulog,*) "PHYS_GRID_INIT error: opt", opt, "specified, ", &
                        "but vertical decomposition not limited to single process"
      endif
      call endrun()
   endif
!
! Assign process pairs to SMPs, attempting to maximize the number of
! column/twin pairs in the same SMP.
!
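! (Illustrative example, not from the original source: with npes = 4,
!  if most of process 0's column twins reside on process 2, the greedy
!  search below pairs 0 with 2 into virtual SMP 0, and the remaining
!  processes 1 and 3 into virtual SMP 1, giving nsmpx = 2.)
!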
   assigned(:) = .false.
   twin_cnt(:) = 0
   nsmpx = 0
   do p=0,npes-1
      if ((.not. assigned(p)) .and. (proc_busy_d(p))) then
!
! For each process, determine number of twins in each of the other processes
! (running over all columns multiple times to minimize memory requirements).
!
         do gcol_latlon=1,ngcols_p
            gcol = latlon_to_dyn_gcol_map(gcol_latlon)
            if (col_proc_mapx(gcol) .eq. p) then
               twin_cnt(twin_proc_mapx(gcol)) = &
                  twin_cnt(twin_proc_mapx(gcol)) + 1
            endif
         enddo
!
! Find process with maximum number of twins that has not yet been designated
! a partner.
!
         maxpartner = -1
         maxcnt = 0
         do twp=0,npes-1
            if ((.not. assigned(twp)) .and. (twp .ne. p)) then
               if (twin_cnt(twp) >= maxcnt) then
                  maxcnt = twin_cnt(twp)
                  maxpartner = twp
               endif
            endif
         enddo
!
! Assign p and twp to the same SMP node
!
         if (maxpartner .ne. -1) then
            assigned(p) = .true.
            assigned(maxpartner) = .true.
            proc_smp_mapx(p) = nsmpx
            proc_smp_mapx(maxpartner) = nsmpx
            nsmpx = nsmpx + 1
         else
            if (masterproc) then
               write(iulog,*) "PHYS_GRID_INIT error: opt", opt, "specified, ", &
                              "but could not divide processes into pairs."
            endif
            call endrun()
         endif
!
      endif
!
   enddo
!
   return
   end subroutine find_partners
!
!========================================================================

   subroutine find_twin(gcol, smp, proc_smp_mapx, twingcol_f) 1,5
!-----------------------------------------------------------------------
!
! Purpose: Find column that when paired with gcol in a chunk
!          balances the load. A column is a candidate to be paired with
!          gcol if it is in the same SMP node as gcol as defined
!          by proc_smp_mapx.
!
! Method: The day/night and north/south hemisphere complement is
!         tried first. If it is not a candidate or if it has already been
!         assigned, then the day/night complement is tried next. If that
!         also is not available, then nothing is returned.
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
   use dyn_grid, only: get_gcol_block_d, get_block_owner_d
!------------------------------Arguments--------------------------------
   integer, intent(in)  :: gcol                    ! global column index for column
                                                   !  seeking a twin for
   integer, intent(in)  :: smp                     ! index of SMP node
                                                   !  currently assigned to
   integer, intent(in)  :: proc_smp_mapx(0:npes-1) ! process/virtual smp map
   integer, intent(out) :: twingcol_f              ! global column index for twin
!---------------------------Local workspace-----------------------------
   integer :: lon, lat               ! global lon/lat indices for column
                                     !  seeking a twin for
   integer :: twinlon, twinlat       ! lon/lat indices of twin candidate
   integer :: twinlon_off            ! estimate as to offset of twinlon
                                     !  on a latitude line
   logical :: found                  ! found flag
   integer :: i                      ! loop index
   integer :: upper, lower           ! search temporaries
   integer :: twingcol_latlon        ! global physics column index (latlon sorted)
   integer :: twingcol_lonlat        ! global physics column index (lonlat sorted)
   integer :: twingcol               ! global physics column index
   integer :: diff, min_diff, min_i  ! search temporaries
   integer :: jbtwin(npes)           ! global block indices
   integer :: ibtwin(npes)           ! global column indices
   integer :: twinproc, twinsmp      ! process and smp ids
   integer :: clon_p_idx(clon_p_tot) ! index in lonlat ordering for first
                                     !  occurrence of longitude corresponding to
                                     !  given longitude index
   real(r8):: twopi                  ! 2*pi
   real(r8):: clat, twinclat         ! latitude and twin
   real(r8):: clon, twinclon         ! longitude and twin
!-----------------------------------------------------------------------
   twingcol_f = -1

! precompute clon_p_idx
   clon_p_idx(1) = 1
   do i=2,clon_p_tot
      clon_p_idx(i) = clon_p_idx(i-1) + clon_p_cnt(i-1)
   enddo
!
! Try day/night and north/south hemisphere complement first
!
! determine twin latitude
   lat = lat_p(gcol)
   clat = clat_p(lat)
   twinclat = -clat
   twinlat = clat_p_tot+1-lat
   if (clat_p(twinlat) .eq. twinclat) then
      found = .true.
   else
      found = .false.
      upper = twinlat
      lower = twinlat
      if (upper < clat_p_tot) upper = twinlat + 1
      if (lower > 1) lower = twinlat - 1
   endif
   do while (.not. found)
      if ((abs(clat_p(upper)-twinclat) < abs(clat_p(twinlat)-twinclat)) .and. &
          (upper .ne. twinlat)) then
         twinlat = upper
         if (upper < clat_p_tot) then
            upper = twinlat + 1
         else
            found = .true.
         endif
      elseif ((abs(clat_p(lower)-twinclat) < abs(clat_p(twinlat)-twinclat)) .and. &
              (lower .ne. twinlat)) then
         twinlat = lower
         if (lower > 1) then
            lower = twinlat - 1
         else
            found = .true.
         endif
      else
         found = .true.
      endif
   enddo

! determine twin longitude
   twopi = 2.0_r8*pi
   lon = lon_p(gcol)
   clon = clon_p(lon)
   twinclon = mod(clon+pi,twopi)
   twinlon = mod((lon-1)+(clon_p_tot/2), clon_p_tot) + 1
   if (clon_p(twinlon) .eq. twinclon) then
      found = .true.
   else
      found = .false.
      upper = twinlon
      lower = twinlon
      if (upper < clon_p_tot) upper = twinlon + 1
      if (lower > 1) lower = twinlon - 1
   endif
   do while (.not. found)
      if ((abs(clon_p(upper)-twinclon) < abs(clon_p(twinlon)-twinclon)) .and. &
          (upper .ne. twinlon)) then
         twinlon = upper
         if (upper < clon_p_tot) then
            upper = twinlon + 1
         else
            found = .true.
         endif
      elseif ((abs(clon_p(lower)-twinclon) < abs(clon_p(twinlon)-twinclon)) .and. &
              (lower .ne. twinlon)) then
         twinlon = lower
         if (lower > 1) then
            lower = twinlon - 1
         else
            found = .true.
         endif
      else
         found = .true.
      endif
   enddo
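! (Illustrative note: the initial estimate above places the twin half
!  the longitude circle away, e.g. for clon_p_tot = 144 and lon = 1,
!  twinlon = mod(0+72,144)+1 = 73; the search loop then refines this
!  estimate when the unique-longitude list is not evenly spaced.)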
! first, look for an exact match (assuming latitude and longitude symmetries)
   twinlon_off = mod((lon-1)+(clat_p_cnt(twinlat)/2), clat_p_cnt(twinlat))
   twingcol_latlon = clat_p_idx(twinlat) + twinlon_off
   twingcol = latlon_to_dyn_gcol_map(twingcol_latlon)

! otherwise, look around for an approximate match using lonlat sorted indices
   if ((lon_p(twingcol) .ne. twinlon) .or. &
       (lat_p(twingcol) .ne. twinlat)) then
      twingcol_lonlat = clon_p_idx(twinlon)
      twingcol = lonlat_to_dyn_gcol_map(twingcol_lonlat)
      min_diff = abs(lat_p(twingcol) - twinlat)
      min_i = 0
      do i = 1, clon_p_cnt(twinlon)-1
         twingcol_lonlat = clon_p_idx(twinlon)+i
         twingcol = lonlat_to_dyn_gcol_map(twingcol_lonlat)
         diff = abs(lat_p(twingcol) - twinlat)
         if (diff < min_diff) then
            min_diff = diff
            min_i = i
         endif
      enddo
      twingcol_lonlat = clon_p_idx(twinlon) + min_i
      twingcol = lonlat_to_dyn_gcol_map(twingcol_lonlat)
   endif

! Check whether twin and original are in same smp
   found = .false.
   call get_gcol_block_d(twingcol,npes,jbtwin,ibtwin)
   twinproc = get_block_owner_d(jbtwin(1))
   twinsmp = proc_smp_mapx(twinproc)
!
   if ((twinsmp .eq. smp) .and. &
       (knuhcs(twingcol)%chunkid == -1)) then
      found = .true.
      twingcol_f = twingcol
   endif
!
! Try day/night complement next
!
   if (.not. found) then

! first, look for an exact match (assuming longitude symmetries)
      twinlon_off = mod((lon-1)+(clat_p_cnt(lat)/2), clat_p_cnt(lat))
      twingcol_latlon = clat_p_idx(lat) + twinlon_off
      twingcol = latlon_to_dyn_gcol_map(twingcol_latlon)

! otherwise, look around for an approximate match using lonlat
! column ordering
      if ((lon_p(twingcol) .ne. twinlon) .or. &
          (lat_p(twingcol) .ne. lat)) then
         twingcol_lonlat = clon_p_idx(twinlon)
         twingcol = lonlat_to_dyn_gcol_map(twingcol_lonlat)
         min_diff = abs(lat_p(twingcol) - lat)
         min_i = 0
         do i = 1, clon_p_cnt(twinlon)-1
            twingcol_lonlat = clon_p_idx(twinlon)+i
            twingcol = lonlat_to_dyn_gcol_map(twingcol_lonlat)
            diff = abs(lat_p(twingcol) - lat)
            if (diff < min_diff) then
               min_diff = diff
               min_i = i
            endif
         enddo
         twingcol_lonlat = clon_p_idx(twinlon) + min_i
         twingcol = lonlat_to_dyn_gcol_map(twingcol_lonlat)
      endif
!
      call get_gcol_block_d(twingcol,npes,jbtwin,ibtwin)
      twinproc = get_block_owner_d(jbtwin(1))
      twinsmp = proc_smp_mapx(twinproc)
!
      if ((twinsmp .eq. smp) .and. &
          (knuhcs(twingcol)%chunkid == -1)) then
         found = .true.
         twingcol_f = twingcol
      endif
!
   endif
!
   return
   end subroutine find_twin
!
!========================================================================

   subroutine assign_chunks(npthreads, nsmpx, proc_smp_mapx, & 1,5
                            nsmpthreads, nsmpchunks)
!-----------------------------------------------------------------------
!
! Purpose: Assign chunks to processes, balancing the number of
!          chunks per thread and minimizing the communication costs
!          in dp_coupling, subject to the constraint that columns
!          do not migrate outside of the current SMP node.
!
! Method:
!
! Author: Patrick Worley
!
!-----------------------------------------------------------------------
   use pmgrid, only: plev
   use dyn_grid, only: get_gcol_block_cnt_d, get_gcol_block_d, &
                       get_block_owner_d
!------------------------------Arguments--------------------------------
   integer, intent(in)  :: npthreads(0:npes-1)     ! number of OpenMP threads per process
   integer, intent(in)  :: nsmpx                   ! virtual smp count
   integer, intent(in)  :: proc_smp_mapx(0:npes-1) ! process/virtual smp map
   integer, intent(in)  :: nsmpthreads(0:nsmpx-1)  ! number of OpenMP threads
                                                   !  per virtual SMP
   integer, intent(in)  :: nsmpchunks(0:nsmpx-1)   ! number of chunks assigned
                                                   !  to a given virtual SMP
!---------------------------Local workspace-----------------------------
   integer :: i, jb, p               ! loop indices
   integer :: cid                    ! chunk id
   integer :: smp                    ! SMP index
   integer :: curgcol                ! global column index
   integer :: block_cnt              ! number of blocks containing data
                                     !  for a given vertical column
   integer :: blockids(plev+1)       ! block indices
   integer :: bcids(plev+1)          ! block column indices
   integer :: ntsks_smpx(0:nsmpx-1)  ! number of processes per virtual SMP
   integer :: smp_proc_mapx(0:nsmpx-1,max_nproc_smpx) ! virtual smp to process id map
   integer :: cid_offset(0:nsmpx)    ! chunk id virtual smp offset
   integer :: ntmp1_smp(0:nsmpx-1)   ! minimum number of chunks per thread
                                     !  in a virtual SMP
   integer :: ntmp2_smp(0:nsmpx-1)   ! number of extra chunks to be assigned
                                     !  in a virtual SMP
   integer :: ntmp3_smp(0:nsmpx-1)   ! number of processes in a virtual
                                     !  SMP that get more extra chunks
                                     !  than the others
   integer :: ntmp4_smp(0:nsmpx-1)   ! number of extra chunks per process
                                     !  in a virtual SMP
   integer :: ntmp1, ntmp2           ! work variables
!  integer :: npchunks(0:npes-1)     ! number of chunks to be assigned to
!                                    !  a given process
   integer :: cur_npchunks(0:npes-1) ! current number of chunks assigned
                                     !  to a given process
   integer :: column_count(0:npes-1) ! number of columns from current chunk
                                     !  assigned to each process in dynamics
                                     !  decomposition
!-----------------------------------------------------------------------
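!
! Worked example (illustrative only, not from the original source): for
! a virtual SMP with nsmpchunks = 10 chunks and nsmpthreads = 4 threads
! spread over two processes with npthreads = 2 each, the quota code
! below computes ntmp1_smp = 10/4 = 2 chunks per thread and
! ntmp2_smp = 2 extra chunks, so each process receives 2*2 + 1 = 5
! chunks.
!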
!
! Count number of processes per virtual SMP and determine virtual SMP
! to process id map
!
   ntsks_smpx(:) = 0
   smp_proc_mapx(:,:) = -1
   do p=0,npes-1
      smp = proc_smp_mapx(p)
      ntsks_smpx(smp) = ntsks_smpx(smp) + 1
      smp_proc_mapx(smp,ntsks_smpx(smp)) = p
   enddo
!
! Determine chunk id ranges for each virtual SMP
!
   cid_offset(0) = 1
   do smp=1,nsmpx
      cid_offset(smp) = cid_offset(smp-1) + nsmpchunks(smp-1)
   enddo
!
! Determine number of chunks to assign to each process
!
   do smp=0,nsmpx-1
!
! Minimum number of chunks per thread
      ntmp1_smp(smp) = nsmpchunks(smp)/nsmpthreads(smp)
! Number of extra chunks to be assigned
      ntmp2_smp(smp) = mod(nsmpchunks(smp),nsmpthreads(smp))
! Number of processes that get more extra chunks than the others
      ntmp3_smp(smp) = mod(ntmp2_smp(smp),ntsks_smpx(smp))
! Number of extra chunks per process
      ntmp4_smp(smp) = ntmp2_smp(smp)/ntsks_smpx(smp)
      if (ntmp3_smp(smp) > 0) then
         ntmp4_smp(smp) = ntmp4_smp(smp) + 1
      endif
   enddo

   do p=0,npes-1
      smp = proc_smp_mapx(p)
! Update number of extra chunks
      if (ntmp2_smp(smp) > ntmp4_smp(smp)) then
         ntmp2_smp(smp) = ntmp2_smp(smp) - ntmp4_smp(smp)
      else
         ntmp4_smp(smp) = ntmp2_smp(smp)
         ntmp2_smp(smp) = 0
         ntmp3_smp(smp) = 0
      endif
! Set number of chunks
      npchunks(p) = ntmp1_smp(smp)*npthreads(p) + ntmp4_smp(smp)
! Update extra chunk increment
      if (ntmp3_smp(smp) > 0) then
         ntmp3_smp(smp) = ntmp3_smp(smp) - 1
         if (ntmp3_smp(smp) .eq. 0) then
            ntmp4_smp(smp) = ntmp4_smp(smp) - 1
         endif
      endif
   enddo
!
! Assign chunks to processes:
!
   cur_npchunks(:) = 0
!
   do smp=0,nsmpx-1
      do cid=cid_offset(smp),cid_offset(smp+1)-1
!
         do i=1,ntsks_smpx(smp)
            p = smp_proc_mapx(smp,i)
            column_count(p) = 0
         enddo
!
! For each chunk, determine number of columns in each
! process within the dynamics.
         do i=1,chunks(cid)%ncols
            curgcol = chunks(cid)%gcol(i)
            block_cnt = get_gcol_block_cnt_d(curgcol)
            call get_gcol_block_d(curgcol,block_cnt,blockids,bcids)
            do jb=1,block_cnt
               p = get_block_owner_d(blockids(jb))
               column_count(p) = column_count(p) + 1
            enddo
         enddo
!
! Eliminate processes that already have their quota of chunks
         do i=1,ntsks_smpx(smp)
            p = smp_proc_mapx(smp,i)
            if (cur_npchunks(p) == npchunks(p)) then
               column_count(p) = -1
            endif
         enddo
!
! Assign chunk to process with most
! columns from chunk, from among those still available
         ntmp1 = -1
         ntmp2 = -1
         do i=1,ntsks_smpx(smp)
            p = smp_proc_mapx(smp,i)
            if (column_count(p) > ntmp1) then
               ntmp1 = column_count(p)
               ntmp2 = p
            endif
         enddo
         cur_npchunks(ntmp2) = cur_npchunks(ntmp2) + 1
         chunks(cid)%owner   = ntmp2
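! (Note: per the Purpose comment above, assigning each chunk to the
!  process owning the most of its columns in the dynamics decomposition
!  minimizes the data moved between the block and chunk decompositions
!  in dp_coupling.)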
!
! Update total number of columns assigned to this process
         gs_col_num(ntmp2) = gs_col_num(ntmp2) + chunks(cid)%ncols
!
      enddo
!
   enddo
!
   return
   end subroutine assign_chunks
!
!========================================================================

!#######################################################################

end module phys_grid