*dk pcom1
      subroutine pcom1(coml,comr,crnr,nj,kr,kt,ighost)

c... This routine sends and receives boundary information
c... to and from the neighboring subdomains.

      include 'size.h'
      include 'pcom.h'

      real coml((kt+1)*nd*nj*(kr+1),2)
      real comr((kt+1)*nd*nj*(kr+1),2)
      real crnr(nd*nj*(kr + 1))
      integer ierror, sender
      common /nbr1/ ileft(4,0:6), iright(4,0:6)

c... Increment the message number.

      msgnum = msgnum + 1

c... Compute message length along subdomain edges.

      mm = (kt + 1)*nd*nj*(kr + 1)

c... Send row/column data in coml to the two subdomains to the left
c... and receive similar information from the two subdomains to the
c... right.

      do ib=1,2
         CALL MPI_IRECV( COMR( 1,IB ), MM, MPI_DATATYPE,
     &                   IRIGHT( IB,LVPROC ), MSGNUM, MPI_COMM_WORLD,
     &                   RECV_REQUESTS( IB ), IERROR )
      enddo

      do ib=1,2
         CALL MPI_SEND( COML( 1,IB ), MM, MPI_DATATYPE,
     &                  ILEFT( IB,LVPROC ), MSGNUM,
     &                  MPI_COMM_WORLD, IERROR )
      enddo

c... Increment the message number.

      msgnum = msgnum + 1

c... Compute message length for the subdomain corner.

      mm = nd*nj*(kr + 1)

c... Receive corner data from the appropriate right subdomain.

      CALL MPI_IRECV( COML(1,1), MM, MPI_DATATYPE, MPI_ANY_SOURCE,
     &                MSGNUM, MPI_COMM_WORLD, RECV_REQUESTS( 3 ),
     &                IERROR )

c... Send corner data to the appropriate left subdomain.

      iproc = ileft(3,lvproc)
      if(ighost .eq. 1) iproc = ileft(4,lvproc)

      CALL MPI_SEND( CRNR, MM, MPI_DATATYPE, IPROC,
     &               MSGNUM, MPI_COMM_WORLD, IERROR )

c... Complete the nonblocking corner data receive before processing.

      CALL MPI_WAIT( RECV_REQUESTS(3), RECV_STATUS(:,3), IERROR )

c... Copy the corner data into its correct final location.

      CALL SCOPY( MM, COML(1,1), 1, CRNR, 1 )

c... Complete the nonblocking upper/lower neighbor receives before
c... processing.

      CALL MPI_WAITALL( 2, RECV_REQUESTS, RECV_STATUS(:,1:2),
     &                  IERROR )

      end
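
c... Note that pcom1 posts all of its nonblocking receives before any
c... blocking send is issued, so each MPI_SEND finds (or will find) a
c... matching receive already posted and the exchange cannot deadlock.
c... Routine pcom2 below performs the mirror-image exchange, sending
c... to the right and receiving from the left.
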
*dk pcom1to4
      subroutine pcom1to4(a4,a1,lva1,kr,nj)

c... This routine copies array a1 that contains but one point in the
c... lateral grid per active processor to array a4 that contains four
c... points in the lateral grid per active processor (but on only
c... one-fourth the number of active processors).  The parameter lva1
c... specifies the grid level of the array a1.

      include 'size.h'
      include 'pcom.h'

      real a1(0:1,2,nd*nj*(kr+1)), a4(0:2,3,nd*nj*(kr+1))
      integer ierror, sender
      common /bufr2/ a(0:1,2,189*nd*9)

c... Compute the message length.

      mm = nd*nj*(kr+1)
      if(nj .eq. 189) mm = ndo*nj*(kr+1)

c... Increment the message number.

      msgnum = msgnum + 1

c... Compute the number of processors npedg1
c... along a diamond edge for array a1.

      npedg1 = 2**lva1

c... Compute the destination processor number idest.  (The two
c... lateral process-grid indices of mynum are each halved.)

      idest = (mynum/(2*npedg1))*npedg1/2 + mod(mynum, npedg1)/2

c... Send array a1 to the destination processor.

      CALL MPI_ISEND( A1, 4*MM, MPI_DATATYPE, IDEST, MSGNUM,
     &                MPI_COMM_WORLD, SEND_REQUESTS( 1 ), IERROR )

      call nulvec(a4, 9*mm)

      if(mynum .lt. npedg1**2*10/(4*nd)) then

c... Processor mynum is an active processor for array a4.  Receive
c... contributions from four other processors and load array a4.

         do iproc=1,4

            CALL MPI_RECV( A, 4*MM, MPI_DATATYPE, MPI_ANY_SOURCE,
     &                     MSGNUM, MPI_COMM_WORLD, RECV_STATUS(:,1),
     &                     IERROR )

            isrc_proc = RECV_STATUS( MPI_SOURCE, 1 )

c... Compute the (i1,i2) coordinates for this contribution.

            i1 = mod(isrc_proc, 2) + 1
            i2 = mod(isrc_proc/npedg1, 2) + 1

c... Load array a4.

            if(nj .ne. 189) then

               do ii=1,mm
                  a4(i1-1,i2 ,ii) = a(0,1,ii)
                  a4(i1 ,i2 ,ii) = a(1,1,ii)
                  a4(i1-1,i2+1,ii) = a(0,2,ii)
                  a4(i1 ,i2+1,ii) = a(1,2,ii)
               end do

            else

               do ii=1,mm
                  a4(i1-1,i2 ,ii) = a4(i1-1,i2 ,ii) + a(0,1,ii)
                  a4(i1 ,i2 ,ii) = a4(i1 ,i2 ,ii) + a(1,1,ii)
                  a4(i1-1,i2+1,ii) = a4(i1-1,i2+1,ii) + a(0,2,ii)
                  a4(i1 ,i2+1,ii) = a4(i1 ,i2+1,ii) + a(1,2,ii)
               end do

            endif

         end do

      endif

c... Make sure the nonblocking send has completed before we leave
c... the routine.
c... SLS 10/15/00

      call MPI_WAIT( SEND_REQUESTS( 1 ), SEND_STATUS(:,1), IERROR )

      end

*dk pcom2
      subroutine pcom2(coml,comr,crnr,nj,kr,kt,ighost)

c... This routine sends and receives boundary information
c... to and from the neighboring subdomains.

      include 'size.h'
      include 'pcom.h'

      real coml((kt+1)*nd*nj*(kr+1),2)
      real comr((kt+1)*nd*nj*(kr+1),2)
      real crnr(nd*nj*(kr + 1))
      integer ierror, sender
      common /nbr1/ ileft(4,0:6), iright(4,0:6)

c... Increment the message number.

      msgnum = msgnum + 1

c... Compute message length along subdomain edges.

      mm = (kt + 1)*nd*nj*(kr + 1)

c... Send row/column data in comr to the two subdomains to the right
c... and receive similar information from the two subdomains to the
c... left.

      do ib=1,2
         CALL MPI_IRECV( COML( 1,IB ), MM, MPI_DATATYPE,
     &                   ILEFT( IB,LVPROC ), MSGNUM, MPI_COMM_WORLD,
     &                   RECV_REQUESTS( IB ), IERROR )
      enddo

      do ib=1,2
         CALL MPI_SEND( COMR( 1,IB ), MM, MPI_DATATYPE,
     &                  IRIGHT( IB,LVPROC ), MSGNUM,
     &                  MPI_COMM_WORLD, IERROR )
      enddo

c... Increment the message number.

      msgnum = msgnum + 1

c... Compute message length for the subdomain corner.

      mm = nd*nj*(kr + 1)

c... Receive corner data from the appropriate left subdomain.

      CALL MPI_IRECV( COMR(1,1), MM, MPI_DATATYPE, MPI_ANY_SOURCE,
     &                MSGNUM, MPI_COMM_WORLD, RECV_REQUESTS( 3 ),
     &                IERROR )

c... Send corner data to the appropriate right subdomain.

      iproc = iright(3,lvproc)
      if(ighost .eq. 1) iproc = iright(4,lvproc)

      CALL MPI_SEND( CRNR, MM, MPI_DATATYPE, IPROC,
     &               MSGNUM, MPI_COMM_WORLD, IERROR )

c... Complete the nonblocking corner data receive before processing.

      CALL MPI_WAIT( RECV_REQUESTS(3), RECV_STATUS(:,3), IERROR )

c... Copy the corner data into its correct location.

      CALL SCOPY( MM, COMR(1,1), 1, CRNR, 1 )

c... Complete the nonblocking upper/lower neighbor data receives
c... before processing.

      CALL MPI_WAITALL( 2, RECV_REQUESTS, RECV_STATUS(:,1:2),
     &                  IERROR )

      end

*dk pcom4to1
      subroutine pcom4to1(a1,a4,lva1,kr,nj)

c... This routine copies array a4 that contains four points in the
c... lateral grid per active processor to array a1 that contains but
c... one point in the lateral grid per active processor (but on four
c... times the number of active processors).  The parameter lva1
c... specifies the grid level of the array a1.

      include 'size.h'
      include 'pcom.h'

      real a1(0:1,2,nd*nj*(kr+1)), a4(0:2,3,nd*nj*(kr+1))
      real atemp(0:1,2,nd*nj*(kr+1))
      integer ierror, sender

c... Compute the message length.

      mm = nd*nj*(kr+1)

c... Increment the message number.

      msgnum = msgnum + 1

c... Receive array a1.

      CALL MPI_IRECV( A1, 4*MM, MPI_DATATYPE, MPI_ANY_SOURCE,
     &                MSGNUM, MPI_COMM_WORLD, RECV_REQUESTS( 1 ),
     &                IERROR )

c... Compute the number of processors npedg1
c... along a diamond edge for array a1.

      npedg1 = 2**lva1

      if(mynum .lt. npedg1**2*10/(4*nd)) then

c... Processor mynum is an active processor for array a4.  Send
c... contributions from array a4 to four other processors.

         do i2=1,2
            do i1=1,2

c... Copy the appropriate elements of a4 into array atemp.

               do ii=1,mm
                  atemp(0,1,ii) = a4(i1-1,i2 ,ii)
                  atemp(1,1,ii) = a4(i1 ,i2 ,ii)
                  atemp(0,2,ii) = a4(i1-1,i2+1,ii)
                  atemp(1,2,ii) = a4(i1 ,i2+1,ii)
               end do

c... Compute the destination processor number idest.  (This is the
c... inverse of the index mapping used in pcom1to4.)

               idest = (2*mynum/npedg1)*2*npedg1 + (i2 - 1)*npedg1
     &               + mod(mynum, npedg1/2)*2 + i1 - 1

c... Send array atemp to the destination processor.

               CALL MPI_SEND( ATEMP, 4*MM, MPI_DATATYPE, IDEST,
     &                        MSGNUM, MPI_COMM_WORLD, IERROR )

            end do
         end do

      end if

c... Complete the nonblocking receive before processing.

      CALL MPI_WAIT( RECV_REQUESTS( 1 ), RECV_STATUS(:,1), IERROR )

      end

*dk pcomfromzero
      subroutine pcomfromzero(a,ap,nc)

c... At the lowest (lv = 0) grid level, this routine takes a global
c... array a on processor zero and distributes the appropriate
c... portions to all the active processors as array ap.

      include 'size.h'
      include 'pcom.h'

      real a(4,10,nc), ap(4,nd,nc)
      integer ierror, sender

c... Increment the message number.

      msgnum = msgnum + 1

c... Compute the message length for the subarray.

      mm = 4*nd*nc

      if(mynum .eq. 0) then

c... Send the appropriate portion of the global array to
c... the other active processors.

         do iproc=1,10/nd-1

            do id=1,nd
               jd = id + iproc*nd
               do ii=1,4
                  do j=1,nc
                     ap(ii,id,j) = a(ii,jd,j)
                  end do
               end do
            end do

            CALL MPI_SEND( AP, MM, MPI_DATATYPE, IPROC,
     &                     MSGNUM, MPI_COMM_WORLD, IERROR )

         end do

c... Now load the correct portion of the global array
c... into the local array.

         do id=1,nd
            do ii=1,4
               do j=1,nc
                  ap(ii,id,j) = a(ii,id,j)
               end do
            end do
         end do

      else

c... For all processes other than zero, receive the local
c... array from process zero.

         CALL MPI_RECV( AP, MM, MPI_DATATYPE, MPI_ANY_SOURCE,
     &                  MSGNUM, MPI_COMM_WORLD, RECV_STATUS(:,1),
     &                  IERROR )

      end if

      end

*dk pcomptcl
      subroutine pcomptcl(ibndr,w,nsnd,nrcv)

c... This routine performs interprocess communication to move
c... particles across subdomain boundaries.

      include 'size.h'
      include 'pcom.h'

      parameter (nq=(nt+1)**2*nd*5)

      real w(5,(nt+1)**2*5,4)
      common /prt1/ npart, p(nq), q(nq), idp(nq), kpl(nq), ap(nq)
      common /nbr1/ ileft(4,0:6), iright(4,0:6)
      integer ibndr((nt+1)**2*5,4), nsnd(4), nrcv(4)
      integer ierror, sender

c... Increment the message number.

      msgnum = msgnum + 1

      do ib=1,4

c... Send particle data to the four neighboring subdomains.

         if(ib.eq.1) iproc = iright(1,lvproc)
         if(ib.eq.2) iproc = ileft(2,lvproc)
         if(ib.eq.3) iproc = ileft(1,lvproc)
         if(ib.eq.4) iproc = iright(2,lvproc)

c... Load particle attributes into array w to be sent to a
c... neighboring subdomain.

         do i=1,nsnd(ib)
            k = ibndr(i,ib)
            w(1,i,1) = p(k)
            w(2,i,1) = q(k)
            w(3,i,1) = idp(k)
            w(4,i,1) = kpl(k)
            w(5,i,1) = ap(k)
         end do

         CALL MPI_SEND( W( 1,1,1 ), 5*NSND( IB ),
     &                  MPI_DATATYPE, IPROC, MSGNUM,
     &                  MPI_COMM_WORLD, IERROR )

      end do

      do ib=1,4

c... Receive particle data from the four neighboring subdomains.

         CALL MPI_PROBE( MPI_ANY_SOURCE, MSGNUM, MPI_COMM_WORLD,
     &                   MSG_STATUS, IERROR )
         CALL MPI_GET_COUNT( MSG_STATUS, MPI_DATATYPE, ICOUNT,
     &                       IERROR )

         iproc = MSG_STATUS( MPI_SOURCE )

         CALL MPI_IRECV( W( 1,1,IB ), ICOUNT, MPI_DATATYPE,
     &                   IPROC, MSGNUM, MPI_COMM_WORLD,
     &                   RECV_REQUESTS( 1 ), IERROR )

c... Complete the nonblocking receive before processing.

         CALL MPI_WAITANY( 1, RECV_REQUESTS, SENDER, RECV_STATUS,
     &                     IERROR )

         nrcv(ib) = ICOUNT/5

      end do

      end
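
c... Note that pcomptcl probes each incoming message with MPI_PROBE
c... and MPI_GET_COUNT before posting the receive, so the number of
c... particles arriving from a neighbor need not be known in advance;
c... the count nrcv is simply the number of received elements divided
c... by the five attributes stored per particle.
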
*dk pcomtozero
      subroutine pcomtozero(a,ap,nc)

c... At the lowest (lv = 0) grid level, this routine loads a global
c... array a on processor zero from contributions ap from all the
c... other active processors.

      include 'size.h'
      include 'pcom.h'

      real a(4,10,nc), ap(4,nd,nc)
      integer ierror, sender

c... Increment the message number.

      msgnum = msgnum + 1

c... Compute the message length for the subarray.

      mm = 4*nd*nc

      if(mynum .ne. 0) then

c... For all processes other than zero, send the local
c... array to process zero.

         CALL MPI_SEND( AP, MM, MPI_DATATYPE, 0, MSGNUM,
     &                  MPI_COMM_WORLD, IERROR )

      else

c... On processor zero, first load the local array into
c... the global array.

         do id=1,nd
            do ii=1,4
               do j=1,nc
                  a(ii,id,j) = ap(ii,id,j)
               end do
            end do
         end do

c... Next receive the arrays from all the other processes
c... and copy them into the global array.

         do iproc=1,10/nd-1

            CALL MPI_RECV( AP, MM, MPI_DATATYPE, MPI_ANY_SOURCE,
     &                     MSGNUM, MPI_COMM_WORLD, RECV_STATUS(:,1),
     &                     IERROR )

c... Determine the rank of the sending process.

            jproc = RECV_STATUS( MPI_SOURCE, 1 )

            do id=1,nd
               jd = id + jproc*nd
               do ii=1,4
                  do j=1,nc
                     a(ii,jd,j) = ap(ii,id,j)
                  end do
               end do
            end do

         end do

      end if

      end

*dk pexit
      subroutine pexit

c... This routine terminates the MPI environment.

      CALL MPI_FINALIZE( IERROR )

      end

*dk pinit
      subroutine pinit

c... This routine initializes the MPI processes.

      include 'size.h'
      include 'pcom.h'

      common /nbr1/ ileft(4,0:6), iright(4,0:6)
      integer ierror

c... Set the message number.

      msgnum = 1

c... Enroll this process in MPI.

      CALL MPI_INIT( IERROR )
      CALL MPI_COMM_RANK( MPI_COMM_WORLD, MYNUM, IERROR )
      CALL MPI_COMM_SIZE( MPI_COMM_WORLD, NPROC, IERROR )

c... Check the number of allocated processors against the number of
c... compiled processors.

      if ( nproc /= (mt/nt)**2*10/nd ) then
         if (mynum == 0) write(*,*) "The number of compiled and",
     &      " allocated processors does not agree."
         call MPI_ABORT ( MPI_COMM_WORLD, -999, IERROR)
         stop
      endif

c... Determine the process numbers of neighboring subdomains at each
c... process grid level.  (Because 1.45 slightly exceeds 1/ln 2, lvg
c... equals the base-two logarithm of mt/nt when mt/nt is a power
c... of two.)

      lvg = 1.45*log(real(mt/nt))

      do lv=0,lvg

         if(lv.ge.1 .and. mynum.lt.2**(2*lv)*10/nd) then
            call neighborid(2**lv,ileft(1,lv),iright(1,lv))
         else
            ileft(1,lv)  = mynum
            ileft(2,lv)  = mynum
            ileft(3,lv)  = mynum
            ileft(4,lv)  = mynum
            iright(1,lv) = mynum
            iright(2,lv) = mynum
            iright(3,lv) = mynum
            iright(4,lv) = mynum
         end if

         if(lv.eq.0 .and. mynum.le.1) then
            ileft(2,lv)  = 1 - mynum
            ileft(3,lv)  = 1 - mynum
            ileft(4,lv)  = 1 - mynum
            iright(2,lv) = 1 - mynum
            iright(3,lv) = 1 - mynum
            iright(4,lv) = 1 - mynum
         end if

      end do

      end

*dk pmax
      subroutine pmax(smax)

c... This routine obtains the global maximum of the scalar
c... quantity smax from all of the nproc processes.

      include 'pcom.h'

      qmax = smax

      CALL MPI_ALLREDUCE( QMAX, SMAX, 1, MPI_DATATYPE,
     &                    MPI_MAX, MPI_COMM_WORLD, IERROR )

      end

*dk pmin
      subroutine pmin(smin)

c... This routine obtains the global minimum of the scalar
c... quantity smin from all of the nproc processes.

      include 'pcom.h'

      qmin = smin

      CALL MPI_ALLREDUCE( QMIN, SMIN, 1, MPI_DATATYPE,
     &                    MPI_MIN, MPI_COMM_WORLD, IERROR )

      end

*dk psum
      subroutine psum(u,n)

c... This routine obtains the global sum for each of the n
c... components of array u from all of the nproc processes.

      include 'size.h'
      include 'pcom.h'

      real u(n), v(300)

      kproc = mproc*10/nd

      if(kproc .eq. 1) return

      if(mynum .lt. kproc) then
         do i=1,n
            v(i) = u(i)
         end do
      else
         do i=1,n
            v(i) = 0.
         end do
      end if

      CALL MPI_ALLREDUCE( V, U, N, MPI_DATATYPE,
     &                    MPI_SUM, MPI_COMM_WORLD, IERROR )

      end
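
c... Note that psum copies u into the fixed local buffer v(300) and
c... therefore assumes n does not exceed 300 components; psumlong
c... below avoids this limit by taking an explicit work array wk of
c... length n from the caller.
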
*dk psumlong
      subroutine psumlong(u,wk,n)

c... This routine obtains the global sum for each of the n
c... components of array u from all of the nproc processes.
c... It differs from routine psum in that the work array wk
c... is passed in explicitly by the caller.

      include 'size.h'
      include 'pcom.h'

      real u(n), wk(n)

      kproc = mproc*10/nd

      if(kproc .eq. 1) return

      if(mynum .lt. kproc) then
         do i=1,n
            wk(i) = u(i)
         end do
      else
         do i=1,n
            wk(i) = 0.
         end do
      end if

      CALL MPI_ALLREDUCE( WK, U, N, MPI_DATATYPE,
     &                    MPI_SUM, MPI_COMM_WORLD, IERROR )

      end
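
c... Note that pmax, pmin, psum, and psumlong all reduce over
c... MPI_COMM_WORLD, so every process must call them, and every
c... process receives the reduced result.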