Index: coll.tex =================================================================== --- coll.tex (revision 495) +++ coll.tex (revision 549) @@ -16,67 +16,77 @@ The functions of this type provided by \MPI/ are the following: \begin{itemize} \item -\mpifunc{MPI\_BARRIER}: +\mpifunc{MPI\_BARRIER}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IBARRIER}}: Barrier synchronization across %all group members all members of a group -(Section~\ref{sec:coll-barrier}). +(Section~\ref{sec:coll-barrier}\MPIreplace{3.0}{109}{}{ and Section~\ref{sec:nbcoll-ibarrier}}). \item -\mpifunc{MPI\_BCAST}: +\mpifunc{MPI\_BCAST}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IBCAST}}: Broadcast from one member to all members of a group -(Section~\ref{sec:coll-broadcast}). +(Section~\ref{sec:coll-broadcast}\MPIreplace{3.0}{109}{}{ and Section~\ref{sec:nbcoll-ibroadcast}}). This is shown as ``broadcast'' in Figure~\ref{fig:collcom}. \item -\mpifunc{MPI\_GATHER}, \mpifunc{MPI\_GATHERV}: +\mpifunc{MPI\_GATHER}\MPIreplace{3.0}{109}{}{, MPI\_IGATHER}, +\mpifunc{MPI\_GATHERV}, \MPIreplace{3.0}{109}{}{MPI\_IGATHERV}: Gather data from %all group members to one member all members of a group to one member -(Section~\ref{sec:coll-gather}). +(Section~\ref{sec:coll-gather}\MPIreplace{3.0}{109}{}{ and Section~\ref{sec:nbcoll-igather}}). This is shown as ``gather'' in Figure~\ref{fig:collcom}. \item -\mpifunc{MPI\_SCATTER}, \mpifunc{MPI\_SCATTERV}: +\mpifunc{MPI\_SCATTER}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_ISCATTER}}, +\mpifunc{MPI\_SCATTERV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_ISCATTERV}}: Scatter data from one member to all members of a group -(Section~\ref{sec:coll-scatter}). +(Section~\ref{sec:coll-scatter}\MPIreplace{3.0}{109}{}{ and Section \ref{sec:nbcoll-iscatter}}). This is shown as ``scatter'' in Figure~\ref{fig:collcom}. \item -\mpifunc{MPI\_ALLGATHER}, \mpifunc{MPI\_ALLGATHERV}: +\mpifunc{MPI\_ALLGATHER}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLGATHER}}, +\mpifunc{MPI\_ALLGATHERV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLGATHERV}}: A variation on Gather where all members of %the a group receive the result -(Section~\ref {sec:coll-allcast}). +(Section~\ref{sec:coll-allcast}\MPIreplace{3.0}{109}{}{ and Section~\ref{sec:nbcoll-iallcast}}). This is shown as ``allgather'' in Figure~\ref{fig:collcom}. \item -\mpifunc{MPI\_ALLTOALL}, \mpifunc{MPI\_ALLTOALLV}, \mpifunc{MPI\_ALLTOALLW}: +\mpifunc{MPI\_ALLTOALL}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLTOALL}}, +\mpifunc{MPI\_ALLTOALLV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLTOALLV}}, +\mpifunc{MPI\_ALLTOALLW}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLTOALLW}}: Scatter/Gather data from all members to all members of a group (also called complete exchange) -(Section~\ref{sec:coll-alltoall}). +(Section~\ref{sec:coll-alltoall}\MPIreplace{3.0}{109}{}{ and Section~\ref{sec:nbcoll-ialltoall}}). This is shown as ``complete exchange'' in Figure~\ref{fig:collcom}. \item -\mpifunc{MPI\_ALLREDUCE}, \mpifunc{MPI\_REDUCE}: +\mpifunc{MPI\_ALLREDUCE}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLREDUCE}}, +\mpifunc{MPI\_REDUCE}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IREDUCE}}: Global reduction operations such as sum, max, min, or user-defined functions, where the result is returned to %all group members all members of a group +\MPIreplace{3.0}{109}{}{(Section~\ref{subsec:coll-all-reduce} and Section~\ref{subsec:nbcoll-all-reduce}) } and a variation where the result is returned to only one member -(Section~\ref{global-reduce}). 
+(Section~\ref{global-reduce}\MPIreplace{3.0}{109}{}{ and Section~\ref{subsec:nbcoll-ireduce}}). \item -\mpifunc{MPI\_REDUCE\_SCATTER}: +\mpifunc{MPI\_REDUCE\_SCATTER}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IREDUCE\_SCATTER}, \mpifunc{MPI\_REDUCE\_SCATTER\_BLOCK}, \mpifunc{MPI\_IREDUCE\_SCATTER\_BLOCK}}: A combined reduction and scatter operation -(Section~\ref{sec:coll-reduce-scatter}). +(Section~\ref{sec:coll-reduce-scatter}\MPIreplace{3.0}{109}{}{ and Section~\ref{sec:nbcoll-reduce-scatter}}). \item -\mpifunc{MPI\_SCAN}, \mpifunc{MPI\_EXSCAN}: +\mpifunc{MPI\_SCAN}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_ISCAN}}, +\mpifunc{MPI\_EXSCAN}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IEXSCAN}}: Scan across all members of a group (also called prefix) -(Section~\ref{sec:coll-scan}). +(Section~\ref{sec:coll-scan}\MPIreplace{3.0}{109}{}{, +Section~\ref{subsec:coll-exscan}, Section~\ref{subsec:nbcoll-iscan}, and +Section~\ref{subsec:nbcoll-iexscan}}). \end{itemize} \begin{figure} @@ -128,9 +138,16 @@ type maps (the layout in memory, see Section~\ref{sec:pt2pt-datatype}) between sender and receiver are still allowed. -Collective routine calls can (but are not required to) return as soon as their -participation in the collective communication is complete. The completion -of a call indicates that the caller is now free to modify locations in the +Collective \MPIreplace{3.0}{109}{routine calls}{operations} can (but are not required to) +\MPIreplace{3.0}{109}{return}{complete} as soon as \MPIreplace{3.0}{109}{their}{the +caller's} +participation in the collective communication is +\MPIreplace{3.0}{109}{complete}{finished}. \MPIreplace{3.0}{109}{}{A blocking operation is +complete as soon as the call returns. A nonblocking (immediate) call +requires a separate completion call (cf. Section~\ref{sec:pt2pt-nonblock}).} +The completion +of a \MPIreplace{3.0}{109}{call}{collective operation} indicates that the caller +is \MPIreplace{3.0}{109}{now}{} free to modify locations in the communication buffer. It does not indicate that other processes in the group have completed or even started the operation (unless otherwise @@ -138,14 +155,19 @@ implied by % % the description of the operation). -Thus, a collective communication call may, or +% htor: the MPIreplace macro causes some LaTeX problems here :-/ +\MPIreplace{3.0}{109}{Thus, a collective communication call may, or may not, have the effect of synchronizing all calling processes. -This statement excludes, of course, the barrier function. +This statement excludes, of course, the barrier +function}{Thus, a collective communication function may, or may not, +have the effect of synchronizing all calling processes. This statement +excludes, of course, the barrier operation}. Collective communication calls may use the same communicators as point-to-point communication; \MPI/ guarantees that messages generated on behalf of collective communication calls will not be confused with messages generated by point-to-point communication. +\MPIreplace{3.0}{109}{}{The collective operations do not have a message tag argument.} A more detailed discussion of correct use of collective routines is found in Section~\ref {coll:correct}. @@ -159,13 +181,13 @@ The statements about synchronization are made so as to allow a variety of implementations of the collective functions. -The collective operations do not accept a message tag argument. +\MPIreplace{3.0}{109}{The collective operations do not accept a message tag argument. 
If future revisions of \MPI/ define nonblocking collective functions, then tags (or a similar mechanism) % will might need to be added so as -to allow the dis-ambiguation of multiple, pending, collective operations. +to allow the dis-ambiguation of multiple, pending, collective operations.}{} \end{rationale} \begin{users} @@ -221,7 +243,7 @@ Groups and communicators are discussed in full detail in Chapter ~\ref{chap:context}. For the purposes of this chapter, it is sufficient to know that there are two types of communicators: {\em intra-communicators} and {\em inter-communicators}. -An intracommunicator can be thought of as an indentifier for a single group of processes +An intracommunicator can be thought of as an i\MPIreplace{3.0}{109}{n}{}dentifier for a single group of processes linked with a context. An intercommunicator identifies two distinct groups of processes linked with a context. @@ -260,9 +282,9 @@ Note that \constskip{MPI\_IN\_PLACE} is a special kind of value; it has the same restrictions on its use that \consti{MPI\_BOTTOM} has. - +\MPIreplace{3.0}{109}{ Some intracommunicator collective operations do not support the ``in place'' -option (e.g., \mpifunci{MPI\_ALLTOALLV}). +option (e.g., \mpifunci{MPI\_ALLTOALLV}).}{} \end{users} %\discuss{Does anyone know if the INTENT problem can be fixed by telling a @@ -283,33 +305,42 @@ \item[All-To-All] All processes contribute to the result. All processes receive the result. \begin{itemize} - \item \mpifunc{MPI\_ALLGATHER}, \mpifunc{MPI\_ALLGATHERV} - \item \mpifunc{MPI\_ALLTOALL}, \mpifunc{MPI\_ALLTOALLV}, - \mpifunc{MPI\_ALLTOALLW} - \item \mpifunc{MPI\_ALLREDUCE}, \mpifunc{MPI\_REDUCE\_SCATTER} - \item \mpifunc{MPI\_BARRIER} + \item \mpifunc{MPI\_ALLGATHER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLGATHER},} + \mpifunc{MPI\_ALLGATHERV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLGATHERV}} + \item \mpifunc{MPI\_ALLTOALL},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLTOALL},} + \mpifunc{MPI\_ALLTOALLV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLTOALLV}}, + \mpifunc{MPI\_ALLTOALLW}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLTOALLW}} + \item \mpifunc{MPI\_ALLREDUCE},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLREDUCE}, } + \mpifunc{MPI\_REDUCE\_SCATTER}\MPIreplace{3.0}{109}{}{, + \mpifunc{MPI\_IREDUCE\_SCATTER}, + \mpifunc{MPI\_REDUCE\_SCATTER\_BLOCK}, + \mpifunc{MPI\_IREDUCE\_SCATTER\_BLOCK}} + \item \mpifunc{MPI\_BARRIER}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IBARRIER}} \end{itemize} \item[All-To-One] All processes contribute to the result. One process receives the result. \begin{itemize} - \item \mpifunc{MPI\_GATHER}, \mpifunc{MPI\_GATHERV} - \item \mpifunc{MPI\_REDUCE} + \item \mpifunc{MPI\_GATHER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IGATHER},} + \mpifunc{MPI\_GATHERV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IGATHERV}} + \item \mpifunc{MPI\_REDUCE}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IREDUCE}} \end{itemize} \item[One-To-All] One process contributes to the result. All processes receive the result. \begin{itemize} - \item \mpifunc{MPI\_BCAST} - \item \mpifunc{MPI\_SCATTER}, \mpifunc{MPI\_SCATTERV} + \item \mpifunc{MPI\_BCAST}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IBCAST}} + \item \mpifunc{MPI\_SCATTER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_ISCATTER}, } + \mpifunc{MPI\_SCATTERV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_ISCATTERV}} \end{itemize} \item[Other] Collective operations that do not fit into one of the above categories. 
\begin{itemize} %\item \mpifunc{MPI\_SCAN} - \item \mpifunc{MPI\_SCAN}, \mpifunc{MPI\_EXSCAN} + \item \mpifunc{MPI\_SCAN}, \MPIreplace{3.0}{109}{}{\mpifunc{MPI\_ISCAN}, } + \mpifunc{MPI\_EXSCAN}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IEXSCAN}} \end{itemize} \end{description} -The data movement patterns of \mpifunc{MPI\_SCAN} -and \mpifunc{MPI\_EXSCAN} +The data movement patterns of \mpifunc{MPI\_SCAN}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_ISCAN}} +\MPIreplace{3.0}{109}{and}{,} \mpifunc{MPI\_EXSCAN}\MPIreplace{3.0}{109}{}{, and \mpifunc{MPI\_IEXSCAN}} do not fit this taxonomy. %%%%%%%%%%%%%%%%%% @@ -343,14 +374,22 @@ % apply to intercommunicators: The following collective operations also apply to intercommunicators: \begin{itemize} -\item \mpifunc{MPI\_BARRIER}, -\item \mpifunc{MPI\_BCAST}, -\item \mpifunc{MPI\_GATHER}, \mpifunc{MPI\_GATHERV}, -\item \mpifunc{MPI\_SCATTER}, \mpifunc{MPI\_SCATTERV}, -\item \mpifunc{MPI\_ALLGATHER}, \mpifunc{MPI\_ALLGATHERV}, -\item \mpifunc{MPI\_ALLTOALL}, \mpifunc{MPI\_ALLTOALLV}, \mpifunc{MPI\_ALLTOALLW}, -\item \mpifunc{MPI\_ALLREDUCE}, \mpifunc{MPI\_REDUCE}, -\item \mpifunc{MPI\_REDUCE\_SCATTER}. +\item \mpifunc{MPI\_BARRIER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IBARRIER}} +\item \mpifunc{MPI\_BCAST},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IBCAST}} +\item \mpifunc{MPI\_GATHER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IGATHER},} + \mpifunc{MPI\_GATHERV},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IGATHERV},} +\item \mpifunc{MPI\_SCATTER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_ISCATTER},} + \mpifunc{MPI\_SCATTERV},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_ISCATTERV},} +\item \mpifunc{MPI\_ALLGATHER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLGATHER},} + \mpifunc{MPI\_ALLGATHERV},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLGATHERV},} +\item \mpifunc{MPI\_ALLTOALL},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLTOALL},} + \mpifunc{MPI\_ALLTOALLV},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLTOALLV},} + \mpifunc{MPI\_ALLTOALLW},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLTOALLW},} +\item \mpifunc{MPI\_ALLREDUCE},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLREDUCE},} + \mpifunc{MPI\_REDUCE},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IREDUCE},} +\item \mpifunc{MPI\_REDUCE\_SCATTER}\MPIreplace{3.0}{109}{}{, +\mpifunc{MPI\_IREDUCE\_SCATTER}, \mpifunc{MPI\_REDUCE\_SCATTER\_BLOCK}, +\mpifunc{MPI\_IREDUCE\_SCATTER\_BLOCK}}. \end{itemize} In C++, the bindings for these functions are in the \ctype{MPI::Comm} class. 
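+
+\begin{example} {\rm
+\exindex{MPI\_Bcast}
+
+The following fragment sketches, for illustration only, how one of the rooted
+operations listed above might be invoked on an intercommunicator.  We assume
+that an intercommunicator {\tt intercomm} has already been created and that
+the flag {\tt in\_origin\_group} is nonzero exactly on the processes of the
+group that contains the broadcast root; the root argument conventions for
+rooted intercommunicator operations are described later in this chapter.
+
+\begin{verbatim}
+    MPI_Comm intercomm;   /* previously created intercommunicator */
+    int in_origin_group;  /* nonzero in the group containing the root */
+    int rank, buf[100];
+    ...
+    MPI_Comm_rank(intercomm, &rank);
+    if (in_origin_group) {
+        if (rank == 0)
+            /* this process supplies the data */
+            MPI_Bcast(buf, 100, MPI_INT, MPI_ROOT, intercomm);
+        else
+            /* the other processes in the root's group participate
+               without contributing data */
+            MPI_Bcast(buf, 100, MPI_INT, MPI_PROC_NULL, intercomm);
+    } else {
+        /* the remote group receives from rank 0 of the other group */
+        MPI_Bcast(buf, 100, MPI_INT, 0, intercomm);
+    }
+\end{verbatim}
+} \end{example}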
%But @@ -500,7 +539,7 @@ \section{Barrier Synchronization} \label{sec:coll-barrier} -\begin{funcdef}{MPI\_BARRIER( comm )} +\begin{funcdef}{MPI\_BARRIER(comm)} \funcarg{\IN}{comm}{communicator (handle)} \end{funcdef} @@ -532,7 +571,7 @@ \section{Broadcast} \label{sec:coll-broadcast} -\begin{funcdef}{MPI\_BCAST( buffer, count, datatype, root, comm )} +\begin{funcdef}{MPI\_BCAST(buffer, count, datatype, root, comm)} \funcarg{\INOUT}{ buffer}{starting address of buffer (choice)} \funcarg{\IN}{ count}{ number of entries in buffer (% non-negative @@ -542,7 +581,7 @@ \funcarg{\IN}{ comm}{ communicator (handle)} \end{funcdef} -\mpibind{MPI\_Bcast(void*~buffer, int~count, MPI\_Datatype~datatype, int~root, MPI\_Comm~comm )} +\mpibind{MPI\_Bcast(void*~buffer, int~count, MPI\_Datatype~datatype, int~root, MPI\_Comm~comm)} \mpifbind{MPI\_BCAST(BUFFER, COUNT, DATATYPE, ROOT, COMM, IERROR) \fargs BUFFER(*) \\ INTEGER COUNT, DATATYPE, ROOT, COMM, IERROR} % changed in MPI-2 %% \mpicppemptybind{MPI::Intracomm::Bcast(void*~buffer, int~count, const~MPI::Datatype\&~datatype, int~root) const}{void} @@ -611,7 +650,7 @@ int array[100]; int root=0; ... - MPI_Bcast( array, 100, MPI_INT, root, comm); + MPI_Bcast(array, 100, MPI_INT, root, comm); \end{verbatim} As in many of our example code fragments, we assume that some of the variables (such as {\tt comm} in the above) have been assigned @@ -621,7 +660,7 @@ \section{Gather} \label{sec:coll-gather} -\begin{funcdef}{MPI\_GATHER( sendbuf, sendcount, sendtype, recvbuf, +\begin{funcdef}{MPI\_GATHER(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm) } \funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} \funcarg{\IN}{ sendcount}{ number of elements in send buffer (% @@ -719,7 +758,7 @@ buffer arguments of the processes in group B must be consistent with the receive buffer argument of the root. -\begin{funcdef}{MPI\_GATHERV( sendbuf, sendcount, sendtype, recvbuf, +\begin{funcdef}{MPI\_GATHERV(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, root, comm) } \funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} \funcarg{\IN}{ sendcount}{ number of elements in send buffer (% @@ -839,9 +878,9 @@ int gsize,sendarray[100]; int root, *rbuf; ... - MPI_Comm_size( comm, &gsize); + MPI_Comm_size(comm, &gsize); rbuf = (int *)malloc(gsize*100*sizeof(int)); - MPI_Gather( sendarray, 100, MPI_INT, rbuf, 100, MPI_INT, root, comm); + MPI_Gather(sendarray, 100, MPI_INT, rbuf, 100, MPI_INT, root, comm); \end{verbatim} } \end{example} @@ -865,12 +904,12 @@ int gsize,sendarray[100]; int root, myrank, *rbuf; ... - MPI_Comm_rank( comm, &myrank); - if ( myrank == root) { - MPI_Comm_size( comm, &gsize); + MPI_Comm_rank(comm, &myrank); + if (myrank == root) { + MPI_Comm_size(comm, &gsize); rbuf = (int *)malloc(gsize*100*sizeof(int)); } - MPI_Gather( sendarray, 100, MPI_INT, rbuf, 100, MPI_INT, root, comm); + MPI_Gather(sendarray, 100, MPI_INT, rbuf, 100, MPI_INT, root, comm); \end{verbatim} } \end{example} @@ -916,11 +955,11 @@ int root, *rbuf; MPI_Datatype rtype; ... - MPI_Comm_size( comm, &gsize); - MPI_Type_contiguous( 100, MPI_INT, &rtype ); - MPI_Type_commit( &rtype ); + MPI_Comm_size(comm, &gsize); + MPI_Type_contiguous(100, MPI_INT, &rtype); + MPI_Type_commit(&rtype); rbuf = (int *)malloc(gsize*100*sizeof(int)); - MPI_Gather( sendarray, 100, MPI_INT, rbuf, 1, rtype, root, comm); + MPI_Gather(sendarray, 100, MPI_INT, rbuf, 1, rtype, root, comm); \end{verbatim} } \end{example} @@ -947,7 +986,7 @@ ... 
- MPI_Comm_size( comm, &gsize); + MPI_Comm_size(comm, &gsize); rbuf = (int *)malloc(gsize*stride*sizeof(int)); displs = (int *)malloc(gsize*sizeof(int)); rcounts = (int *)malloc(gsize*sizeof(int)); @@ -955,7 +994,7 @@ displs[i] = i*stride; rcounts[i] = 100; } - MPI_Gatherv( sendarray, 100, MPI_INT, rbuf, rcounts, displs, MPI_INT, + MPI_Gatherv(sendarray, 100, MPI_INT, rbuf, rcounts, displs, MPI_INT, root, comm); \end{verbatim} @@ -1006,7 +1045,7 @@ ... - MPI_Comm_size( comm, &gsize); + MPI_Comm_size(comm, &gsize); rbuf = (int *)malloc(gsize*stride*sizeof(int)); displs = (int *)malloc(gsize*sizeof(int)); rcounts = (int *)malloc(gsize*sizeof(int)); @@ -1016,9 +1055,9 @@ } /* Create datatype for 1 column of array */ - MPI_Type_vector( 100, 1, 150, MPI_INT, &stype); - MPI_Type_commit( &stype ); - MPI_Gatherv( sendarray, 1, stype, rbuf, rcounts, displs, MPI_INT, + MPI_Type_vector(100, 1, 150, MPI_INT, &stype); + MPI_Type_commit(&stype); + MPI_Gatherv(sendarray, 1, stype, rbuf, rcounts, displs, MPI_INT, root, comm); \end{verbatim} } \end{example} @@ -1067,8 +1106,8 @@ ... - MPI_Comm_size( comm, &gsize); - MPI_Comm_rank( comm, &myrank ); + MPI_Comm_size(comm, &gsize); + MPI_Comm_rank(comm, &myrank); rbuf = (int *)malloc(gsize*stride*sizeof(int)); displs = (int *)malloc(gsize*sizeof(int)); rcounts = (int *)malloc(gsize*sizeof(int)); @@ -1078,12 +1117,12 @@ } /* Create datatype for the column we are sending */ - MPI_Type_vector( 100-myrank, 1, 150, MPI_INT, &stype); - MPI_Type_commit( &stype ); + MPI_Type_vector(100-myrank, 1, 150, MPI_INT, &stype); + MPI_Type_commit(&stype); /* sptr is the address of start of "myrank" column */ sptr = &sendarray[0][myrank]; - MPI_Gatherv( sptr, 1, stype, rbuf, rcounts, displs, MPI_INT, + MPI_Gatherv(sptr, 1, stype, rbuf, rcounts, displs, MPI_INT, root, comm); \end{verbatim} @@ -1142,8 +1181,8 @@ ... - MPI_Comm_size( comm, &gsize); - MPI_Comm_rank( comm, &myrank ); + MPI_Comm_size(comm, &gsize); + MPI_Comm_rank(comm, &myrank); rbuf = (int *)malloc(gsize*stride*sizeof(int)); displs = (int *)malloc(gsize*sizeof(int)); rcounts = (int *)malloc(gsize*sizeof(int)); @@ -1156,10 +1195,10 @@ disp[0] = 0; disp[1] = 150*sizeof(int); type[0] = MPI_INT; type[1] = MPI_UB; blocklen[0] = 1; blocklen[1] = 1; - MPI_Type_create_struct( 2, blocklen, disp, type, &stype ); - MPI_Type_commit( &stype ); + MPI_Type_create_struct(2, blocklen, disp, type, &stype); + MPI_Type_commit(&stype); sptr = &sendarray[0][myrank]; - MPI_Gatherv( sptr, 100-myrank, stype, rbuf, rcounts, displs, MPI_INT, + MPI_Gatherv(sptr, 100-myrank, stype, rbuf, rcounts, displs, MPI_INT, root, comm); \end{Verbatim} } \end{example} @@ -1189,8 +1228,8 @@ ... - MPI_Comm_size( comm, &gsize); - MPI_Comm_rank( comm, &myrank ); + MPI_Comm_size(comm, &gsize); + MPI_Comm_rank(comm, &myrank); stride = (int *)malloc(gsize*sizeof(int)); ... @@ -1213,10 +1252,10 @@ rbuf = (int *)malloc(bufsize*sizeof(int)); /* Create datatype for the column we are sending */ - MPI_Type_vector( 100-myrank, 1, 150, MPI_INT, &stype); - MPI_Type_commit( &stype ); + MPI_Type_vector(100-myrank, 1, 150, MPI_INT, &stype); + MPI_Type_commit(&stype); sptr = &sendarray[0][myrank]; - MPI_Gatherv( sptr, 1, stype, rbuf, rcounts, displs, MPI_INT, + MPI_Gatherv(sptr, 1, stype, rbuf, rcounts, displs, MPI_INT, root, comm); \end{verbatim} } \end{example} @@ -1274,13 +1313,13 @@ ... 
- MPI_Comm_size( comm, &gsize); - MPI_Comm_rank( comm, &myrank ); + MPI_Comm_size(comm, &gsize); + MPI_Comm_rank(comm, &myrank); /* First, gather nums to root */ rcounts = (int *)malloc(gsize*sizeof(int)); - MPI_Gather( &num, 1, MPI_INT, rcounts, 1, MPI_INT, root, comm); + MPI_Gather(&num, 1, MPI_INT, rcounts, 1, MPI_INT, root, comm); /* root now has correct rcounts, using these we set displs[] so * that data is placed contiguously (or concatenated) at receive end */ @@ -1299,9 +1338,9 @@ type[0] = MPI_INT; type[1] = MPI_UB; blocklen[0] = 1; blocklen[1] = 1; MPI_Type_create_struct( 2, blocklen, disp, type, &stype ); - MPI_Type_commit( &stype ); + MPI_Type_commit(&stype); sptr = &sendarray[0][myrank]; - MPI_Gatherv( sptr, num, stype, rbuf, rcounts, displs, MPI_INT, + MPI_Gatherv(sptr, num, stype, rbuf, rcounts, displs, MPI_INT, root, comm); \end{Verbatim} } \end{example} @@ -1309,7 +1348,7 @@ \section{Scatter} \label{sec:coll-scatter} -\begin{funcdef}{MPI\_SCATTER( sendbuf, sendcount, sendtype, recvbuf, +\begin{funcdef}{MPI\_SCATTER(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm)} \funcarg{\IN}{ sendbuf}{ address of send buffer (choice, significant only at root)} @@ -1411,7 +1450,7 @@ must be consistent with the send buffer argument of the root. -\begin{funcdef}{MPI\_SCATTERV( sendbuf, sendcounts, displs, sendtype, +\begin{funcdef}{MPI\_SCATTERV(sendbuf, sendcounts, displs, sendtype, recvbuf, recvcount, recvtype, root, comm)} \funcarg{\IN}{ sendbuf}{ address of send buffer (choice, significant only at root)} @@ -1420,7 +1459,7 @@ integer array (of length group size) specifying the number of elements to send to each processor } \funcarg{\IN}{ displs}{ integer array (of length group size). Entry -{\tt i} specifies the displacement (relative to \mpiarg{sendbuf} from +{\tt i} specifies the displacement (relative to \mpiarg{sendbuf}\MPIreplace{3.0}{109}{}{)} from which to take the outgoing data to process {\tt i}} \funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} \funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} @@ -1526,10 +1565,10 @@ int gsize,*sendbuf; int root, rbuf[100]; ... - MPI_Comm_size( comm, &gsize); + MPI_Comm_size(comm, &gsize); sendbuf = (int *)malloc(gsize*100*sizeof(int)); ... - MPI_Scatter( sendbuf, 100, MPI_INT, rbuf, 100, MPI_INT, root, comm); + MPI_Scatter(sendbuf, 100, MPI_INT, rbuf, 100, MPI_INT, root, comm); \end{verbatim} } \end{example} @@ -1576,7 +1615,7 @@ ... - MPI_Comm_size( comm, &gsize); + MPI_Comm_size(comm, &gsize); sendbuf = (int *)malloc(gsize*stride*sizeof(int)); ... displs = (int *)malloc(gsize*sizeof(int)); @@ -1585,7 +1624,7 @@ displs[i] = i*stride; scounts[i] = 100; } - MPI_Scatterv( sendbuf, scounts, displs, MPI_INT, rbuf, 100, MPI_INT, + MPI_Scatterv(sendbuf, scounts, displs, MPI_INT, rbuf, 100, MPI_INT, root, comm); \end{verbatim} } \end{example} @@ -1634,8 +1673,8 @@ MPI_Datatype rtype; int i, *displs, *scounts, offset; ... - MPI_Comm_size( comm, &gsize); - MPI_Comm_rank( comm, &myrank ); + MPI_Comm_size(comm, &gsize); + MPI_Comm_rank(comm, &myrank); stride = (int *)malloc(gsize*sizeof(int)); ... 
@@ -1653,10 +1692,10 @@ } /* Create datatype for the column we are receiving */ - MPI_Type_vector( 100-myrank, 1, 150, MPI_INT, &rtype); - MPI_Type_commit( &rtype ); + MPI_Type_vector(100-myrank, 1, 150, MPI_INT, &rtype); + MPI_Type_commit(&rtype); rptr = &recvarray[0][myrank]; - MPI_Scatterv( sendbuf, scounts, displs, MPI_INT, rptr, 1, rtype, + MPI_Scatterv(sendbuf, scounts, displs, MPI_INT, rptr, 1, rtype, root, comm); \end{Verbatim} @@ -1687,7 +1726,7 @@ \section{Gather-to-all} \label{sec:coll-allcast} -\begin{funcdef}{MPI\_ALLGATHER( sendbuf, sendcount, sendtype, recvbuf, +\begin{funcdef}{MPI\_ALLGATHER(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm)} \funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} \funcarg{\IN}{ sendcount}{ number of elements in send buffer (% @@ -1797,7 +1836,7 @@ \end{users} %\discuss{This is an ``in place'' case with replacement.} -\begin{funcdef}{MPI\_ALLGATHERV( sendbuf, sendcount, sendtype, recvbuf, +\begin{funcdef}{MPI\_ALLGATHERV(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm)} \funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} \funcarg{\IN}{ sendcount}{ number of elements in send buffer (% @@ -1908,9 +1947,9 @@ int gsize,sendarray[100]; int *rbuf; ... - MPI_Comm_size( comm, &gsize); + MPI_Comm_size(comm, &gsize); rbuf = (int *)malloc(gsize*100*sizeof(int)); - MPI_Allgather( sendarray, 100, MPI_INT, rbuf, 100, MPI_INT, comm); + MPI_Allgather(sendarray, 100, MPI_INT, rbuf, 100, MPI_INT, comm); \end{verbatim} After the call, every process has the group-wide concatenation of the @@ -2036,7 +2075,7 @@ integer array (of length group size) specifying the number of elements to send to each processor} \funcarg{\IN}{ sdispls}{ integer array (of length group size). Entry -{\tt j} specifies the displacement (relative to \mpiarg{sendbuf} from +{\tt j} specifies the displacement (relative to \mpiarg{sendbuf}\MPIreplace{3.0}{109}{}{)} from which to take the outgoing data destined for process {\tt j}} \funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} \funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} @@ -2046,7 +2085,7 @@ specifying the number of elements that can be received from each processor} \funcarg{\IN}{ rdispls}{ integer array (of length group size). Entry -{\tt i} specifies the displacement (relative to \mpiarg{recvbuf} at +{\tt i} specifies the displacement (relative to \mpiarg{recvbuf}\MPIreplace{3.0}{109}{}{)} at which to place the incoming data from process {\tt i}} \funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} \funcarg{\IN}{ comm}{ communicator (handle)} @@ -2375,7 +2414,7 @@ \subsection{Reduce} \label{subsec:coll-reduce} -\begin{funcdef}{MPI\_REDUCE( sendbuf, recvbuf, count, datatype, op, +\begin{funcdef}{MPI\_REDUCE(sendbuf, recvbuf, count, datatype, op, root, comm)} \funcarg{\IN}{ sendbuf}{ address of send buffer (choice)} \funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice, @@ -2886,7 +2925,7 @@ in[i].val = ain[i]; in[i].rank = myrank; } - MPI_Reduce( in, out, 30, MPI_DOUBLE_INT, MPI_MAXLOC, root, comm ); + MPI_Reduce(in, out, 30, MPI_DOUBLE_INT, MPI_MAXLOC, root, comm); /* At this point, the answer resides on process root */ if (myrank == root) { @@ -2933,8 +2972,8 @@ in(2,i) = myrank ! myrank is coerced to a double END DO - CALL MPI_REDUCE( in, out, 30, MPI_2DOUBLE_PRECISION, MPI_MAXLOC, root, - comm, ierr ) + CALL MPI_REDUCE(in, out, 30, MPI_2DOUBLE_PRECISION, MPI_MAXLOC, root, + comm, ierr) ! 
At this point, the answer resides on process root IF (myrank .EQ. root) THEN @@ -3209,14 +3248,14 @@ case. \end{implementors} -\begin{funcdef}{MPI\_OP\_FREE( op)} +\begin{funcdef}{MPI\_OP\_FREE(op)} \funcarg{\INOUT}{op}{ operation (handle) } \end{funcdef} \cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}% -\mpibind{MPI\_op\_free( MPI\_Op~*op)} +\mpibind{MPI\_op\_free(MPI\_Op~*op)} -\mpifbind{MPI\_OP\_FREE( OP, IERROR) \fargs INTEGER OP, IERROR} +\mpifbind{MPI\_OP\_FREE(OP, IERROR) \fargs INTEGER OP, IERROR} \mpicppemptybind{MPI::Op::Free()}{void} Marks a user-defined reduction operation for deallocation and sets @@ -3248,7 +3287,7 @@ /* the user-defined function */ -void myProd( Complex *in, Complex *inout, int *len, MPI_Datatype *dptr ) +void myProd(Complex *in, Complex *inout, int *len, MPI_Datatype *dptr) { int i; Complex c; @@ -3275,13 +3314,13 @@ /* explain to MPI how type Complex is defined */ - MPI_Type_contiguous( 2, MPI_DOUBLE, &ctype ); - MPI_Type_commit( &ctype ); + MPI_Type_contiguous(2, MPI_DOUBLE, &ctype); + MPI_Type_commit(&ctype); /* create the complex-product user-op */ MPI_Op_create( myProd, 1, &myOp ); - MPI_Reduce( a, answer, 100, ctype, myOp, root, comm ); + MPI_Reduce(a, answer, 100, ctype, myOp, root, comm); /* At this point, the answer, which consists of 100 Complexes, * resides on process root @@ -3306,7 +3345,7 @@ participating in these operations receive identical results. -\begin{funcdef}{MPI\_ALLREDUCE( sendbuf, recvbuf, count, datatype, op, comm)} +\begin{funcdef}{MPI\_ALLREDUCE(sendbuf, recvbuf, count, datatype, op, comm)} \funcarg{\IN}{sendbuf}{ starting address of send buffer (choice)} \funcarg{\OUT}{recvbuf}{ starting address of receive buffer (choice)} \funcarg{\IN}{count}{ number of elements in send buffer (% @@ -3613,7 +3652,7 @@ \label{sec:coll-scan} \subsection{Inclusive Scan} -\begin{funcdef}{MPI\_SCAN( sendbuf, recvbuf, count, datatype, op, comm )} +\begin{funcdef}{MPI\_SCAN(sendbuf, recvbuf, count, datatype, op, comm)} \funcarg{\IN}{sendbuf}{starting address of send buffer (choice)} \funcarg{\OUT}{recvbuf}{ starting address of receive buffer (choice)} \funcarg{\IN}{count}{ number of elements in input buffer (% @@ -3625,7 +3664,7 @@ \end{funcdef} \cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}% -\mpibind{MPI\_Scan(void*~sendbuf, void*~recvbuf, int~count, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm )} +\mpibind{MPI\_Scan(void*~sendbuf, void*~recvbuf, int~count, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm)} \mpifbind{MPI\_SCAN(SENDBUF, RECVBUF, COUNT, DATATYPE, OP, COMM, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER COUNT, DATATYPE, OP, COMM, IERROR} \mpicppemptybind{MPI::Intracomm::Scan(const void*~sendbuf, void*~recvbuf, int~count, const MPI::Datatype\&~datatype, const~MPI::Op\&~op) const}{void} @@ -3705,6 +3744,7 @@ %} \subsection{Exclusive Scan} +\label{subsec:coll-exscan} \label{coll-exscan} % Sect. 5.11.2 p.175 newlabel \status{Passed twice} @@ -3841,14 +3881,14 @@ /* the user-defined function */ -void segScan( SegScanPair *in, SegScanPair *inout, int *len, - MPI_Datatype *dptr ) +void segScan(SegScanPair *in, SegScanPair *inout, int *len, + MPI_Datatype *dptr) { int i; SegScanPair c; for (i=0; i< *len; ++i) { - if ( in->log == inout->log ) + if (in->log == inout->log) c.val = in->val + inout->val; else c.val = inout->val; @@ -3890,13 +3930,791 @@ MPI_Type_commit( &sspair ); /* create the segmented-scan user-op */ - MPI_Op_create( segScan, 0, &myOp ); + MPI_Op_create(segScan, 0, &myOp); ... 
MPI_Scan( &a, &answer, 1, sspair, myOp, comm ); \end{Verbatim} } \end{example}
+
+
+
+\section{Nonblocking Collective Operations}
+As described in Section~\ref{sec:pt2pt-nonblock}, performance of many
+applications can be improved by overlapping communication and
+computation, and many systems enable this. Nonblocking
+collective operations combine the potential benefits of nonblocking
+point-to-point operations, to exploit overlap and to avoid
+synchronization, with the optimized implementation and message
+scheduling provided by collective
+operations~\cite{hoefler-app-parco,hoefler-europvm08-osem}. One way of
+doing this would be to perform a blocking collective operation in a
+separate thread. An alternative mechanism that often leads to better
+performance (e.g., avoids context switching, scheduler overheads, and
+thread management) is to use nonblocking collective
+communication~\cite{hoefler-ib-threads}.
+
+The nonblocking collective communication model is similar to the model
+used for nonblocking point-to-point communication. A nonblocking
+call initiates a collective operation, which must be
+completed in a separate completion call.
+Once initiated, the operation may progress independently of any
+computation or other communication at participating processes. In this
+manner, nonblocking collective operations can mitigate possible
+synchronizing effects of collective operations by running them in the
+``background.''
+In addition to enabling communication-computation
+overlap, nonblocking collective operations can be used to perform
+collective operations on overlapping communicators, which would lead to
+deadlocks with blocking operations. Their semantic advantages can also be
+useful in combination with point-to-point communication.
+
+As in the nonblocking point-to-point case, all calls are local and
+return immediately, irrespective of the status of other processes. The
+call initiates the operation, which indicates that the system may
+start to copy data out of the send buffer and into the receive buffer.
+Once initiated, all associated send buffers should not be modified and
+all associated receive buffers should not be accessed until the
+collective operation completes. The
+call returns a request handle, which must be passed to a
+completion call.
+
+All completion calls (e.g., \mpifunc{MPI\_WAIT}) described in
+Section~\ref{subsec:pt2pt-commend} are supported for nonblocking collective
+operations. Similarly to the blocking case, nonblocking collective operations are
+considered to be complete when the local part of the operation is
+finished, i.e., for the caller, the semantics of the operation are
+guaranteed and all buffers can be safely accessed and modified.
+Completion does not indicate that other processes have completed or even
+started the operation (unless otherwise implied by the description of
+the operation). Completion of a particular nonblocking collective
+operation also does not indicate completion of any other posted
+nonblocking collective (or send-receive) operations, whether they are
+posted before or after the completed operation.
+
+\begin{users}
+Users should be aware that implementations are allowed, but not required
+(with the exception of \mpifunc{MPI\_IBARRIER}), to synchronize processes
+during the completion of a nonblocking collective operation.
+\end{users}
+
+Upon returning from a completion call in which a nonblocking collective
+operation completes, the \mpifunc{MPI\_ERROR} field in the associated
+status object is set appropriately.
The values of +the \mpifunc{MPI\_SOURCE} and \mpifunc{MPI\_TAG} fields are undefined. +It is valid to mix different request types (i.e., any combination of +collective requests, I/O requests, generalized requests, or +point-to-point requests) in functions that enable multiple completions +(e.g., \mpifunc{MPI\_WAITALL}). It is erroneous to call +\mpifunc{MPI\_REQUEST\_FREE} or \mpifunc{MPI\_CANCEL} for a request +associated with a nonblocking collective operation. +Nonblocking collective requests are not persistent. + +\begin{rationale} +Freeing an active nonblocking collective request could cause similar +problems as discussed for point-to-point requests (see Section~\ref{subsec:pt2pt-commend}). +Cancelling a request is not supported because the semantics of this +operation are not well-defined. +\end{rationale} + + +Multiple nonblocking collective +operations can be outstanding on a single communicator. +If the nonblocking call causes some system resource to be exhausted, +then it will fail and generate an MPI exception. Quality implementations +of MPI should ensure that this happens only in pathological cases. +That is, an MPI implementation should be able to support a large number +of pending nonblocking operations. + +Unlike point-to-point operations, nonblocking collective operations do +not match with blocking collective operations, and collective operations +do not have a tag argument. All processes must call collective +operations (blocking and nonblocking) in the same order per +communicator. In particular, once a process calls a collective +operation, all other processes in the communicator must eventually +call the same collective operation, and no other collective operation +with the same communicator +in between. This is consistent with the ordering rules for blocking +collective operations in threaded environments. + +\begin{rationale} +Matching blocking and nonblocking collective operations is not allowed +because the implementation might use different communication algorithms +for the two cases. Blocking collective operations may be optimized +for minimal time to completion, while nonblocking collective operations +may balance time to completion with CPU overhead and asynchronous +progression. + +The use of tags for collective operations can prevent certain hardware +optimizations. +\end{rationale} + +\begin{users} +If program semantics require matching blocking and nonblocking +collective operations, then a nonblocking collective operation can be +initiated and immediately completed with a blocking wait to emulate +blocking behavior. +\end{users} + +In terms of data movements, each nonblocking collective operation has +the same effect as its blocking counterpart for intracommunicators and +intercommunicators after completion. Likewise, upon completion, +nonblocking collective reduction operations have the same effect as +their blocking counterparts, and the same restrictions and +recommendations on reduction orders apply. + +The use of the ``in place'' option is allowed exactly as described for +the corresponding blocking collective operations. When using the ``in +place'' option, message buffers function as both send and receive +buffers. Such buffers should not be modified or accessed until the +operation completes. + +Progression rules for nonblocking collective operations are similar to +progression of nonblocking point-to-point operations, refer to +Section~\ref{subsec:pt2pt-semantics}. 
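+
+\begin{example} {\rm
+\exindex{MPI\_Ibcast}
+\exindex{MPI\_Ibarrier}
+
+The following fragment illustrates two of the points made above, in the
+style of the other example fragments in this chapter.  As usual, we
+assume that variables such as {\tt comm}, {\tt partner}, and {\tt tag}
+have been assigned appropriate values and that a matching send is posted
+by process {\tt partner}.  A collective request may be completed
+together with a point-to-point request by a single multiple-completion
+call, and a nonblocking collective followed immediately by a wait
+emulates its blocking counterpart.
+
+\begin{verbatim}
+    MPI_Request reqs[2];
+    int buf[100], token;
+    ...
+    /* a collective request and a point-to-point request may be
+       completed by one call to MPI_Waitall */
+    MPI_Ibcast(buf, 100, MPI_INT, 0, comm, &reqs[0]);
+    MPI_Irecv(&token, 1, MPI_INT, partner, tag, comm, &reqs[1]);
+    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
+
+    /* starting a nonblocking collective and completing it immediately
+       emulates the corresponding blocking operation */
+    MPI_Ibarrier(comm, &reqs[0]);
+    MPI_Wait(&reqs[0], MPI_STATUS_IGNORE);
+\end{verbatim}
+} \end{example}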
+
+
+\begin{implementors}
+Nonblocking collective operations can be implemented with local
+execution schedules~\cite{hoefler-sc07} using nonblocking point-to-point
+communication and a reserved tag-space.
+\end{implementors}
+
+
+\subsection{Nonblocking Barrier Synchronization}
+\label{sec:nbcoll-ibarrier}
+
+\begin{funcdef}{MPI\_IBARRIER(comm, request)}
+\funcarg{\IN}{comm}{communicator (handle)}
+\funcarg{\OUT}{request}{communication request (handle)}
+\end{funcdef}
+
+\mpibind{MPI\_Ibarrier(MPI\_Comm~comm, MPI\_Request~*request)}
+
+\mpifbind{MPI\_IBARRIER(COMM, REQUEST, IERROR) \fargs INTEGER COMM, REQUEST, IERROR}
+
+
+\mpicppemptybind{MPI::Comm::Ibarrier() const~=~0}{MPI::Request}
+
+
+
+
+\mpifunc{MPI\_IBARRIER} is a nonblocking version of
+\mpifunc{MPI\_BARRIER}. By calling \mpifunc{MPI\_IBARRIER}, a process
+notifies the other participating processes that it has reached the
+barrier. The call returns immediately,
+independent of whether other processes have called \mpifunc{MPI\_IBARRIER}.
+The usual barrier semantics are enforced at the corresponding completion
+operation (test or wait), which in the intracommunicator case will
+complete only after all other processes in the communicator have called
+\mpifunc{MPI\_IBARRIER}. In the intercommunicator case, it will complete
+when all processes in the remote group have called
+\mpifunc{MPI\_IBARRIER}.
+
+\begin{users}
+A nonblocking barrier can be used to hide latency. Moving
+independent computations between the \mpifunc{MPI\_IBARRIER} and the
+subsequent completion call can overlap the barrier latency and therefore
+shorten possible waiting times. The semantic properties are also useful
+when mixing collective operations and point-to-point messages.
+\end{users}
+
+
+\subsection{Nonblocking Broadcast}
+\label{sec:nbcoll-ibroadcast}
+
+\begin{funcdef}{MPI\_IBCAST(buffer, count, datatype, root, comm, request)}
+\funcarg{\INOUT}{ buffer}{starting address of buffer (choice)}
+\funcarg{\IN}{ count}{ number of entries in buffer (%
+non-negative
+integer)}
+\funcarg{\IN}{ datatype}{ data type of buffer (handle)}
+\funcarg{\IN}{ root}{ rank of broadcast root (integer)}
+\funcarg{\IN}{ comm}{ communicator (handle)}
+\funcarg{\OUT}{request}{communication request (handle)}
+\end{funcdef}
+
+\mpibind{MPI\_Ibcast(void*~buffer, int~count, MPI\_Datatype~datatype, int~root, MPI\_Comm~comm, MPI\_Request~*request)}
+
+\mpifbind{MPI\_IBCAST(BUFFER, COUNT, DATATYPE, ROOT, COMM, REQUEST, IERROR) \fargs BUFFER(*) \\ INTEGER COUNT, DATATYPE, ROOT, COMM, REQUEST, IERROR}
+
+
+\mpicppemptybind{MPI::Comm::Ibcast(void*~buffer, int~count, const~MPI::Datatype\&~datatype, int~root) const~=~0}{MPI::Request}
+
+
+
+
+
+
+This call starts a nonblocking variant of \mpifunc{MPI\_BCAST} (see
+Section~\ref{sec:coll-broadcast}).
+
+\subsubsection{Example using \mpifunc{MPI\_IBCAST}}
+
+The example in this section uses intracommunicators.
+
+\begin{example} {\rm
+\label{coll-exZ}
+\exindex{MPI\_Ibcast}
+
+Start a broadcast of 100 {\tt int}s from process {\tt 0} to every process in the
+group, perform some computation on independent data, and then complete
+the outstanding broadcast operation.
+
+\begin{verbatim}
+ MPI_Comm comm;
+ int array1[100], array2[100];
+ int root=0;
+ MPI_Request req;
+ ...
+ MPI_Ibcast(array1, 100, MPI_INT, root, comm, &req); + compute(array2, 100); + MPI_Wait(&req, MPI_STATUS_IGNORE); +\end{verbatim} +} \end{example} + + +\subsection{Nonblocking Gather} +\label{sec:nbcoll-igather} + +\begin{funcdef}{MPI\_IGATHER(sendbuf, sendcount, sendtype, recvbuf, +recvcount, recvtype, root, comm, request) } +\funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} +\funcarg{\IN}{ sendcount}{ number of elements in send buffer (% +non-negative +integer)} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice, +significant only at root)} +\funcarg{\IN}{ recvcount}{ number of elements for any single receive (% +non-negative +integer, significant only at root)} +\funcarg{\IN}{ recvtype}{ data type of recv buffer elements +(significant only at root) (handle)} +\funcarg{\IN}{ root}{ rank of receiving process (integer)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Igather(void*~sendbuf, int~sendcount, MPI\_Datatype~sendtype, void*~recvbuf, int~recvcount, MPI\_Datatype~recvtype, int~root, MPI\_Comm~comm, MPI\_Request~*request) } + +\mpifbind{MPI\_IGATHER(SENDBUF, SENDCOUNT, SENDTYPE, RECVBUF, RECVCOUNT, RECVTYPE, ROOT, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNT, SENDTYPE, RECVCOUNT, RECVTYPE, ROOT, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Igather(const void*~sendbuf, int~sendcount, const MPI::Datatype\&~sendtype, void*~recvbuf, int~recvcount, const~MPI::Datatype\&~recvtype, int~root) const~=~0}{MPI::Request} + + +This call starts a nonblocking variant of \mpifunc{MPI\_GATHER} (see +Section~\ref{sec:coll-gather}). + +\begin{funcdef}{MPI\_IGATHERV(sendbuf, sendcount, sendtype, recvbuf, +recvcounts, displs, recvtype, root, comm, request) } +\funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} +\funcarg{\IN}{ sendcount}{ number of elements in send buffer (% +non-negative +integer)} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice, +significant only at root)} +\funcarg{\IN}{ recvcounts}{% +non-negative +integer array (of length group size) +containing the number of elements that are received from each process +(significant only at root)} +\funcarg{\IN}{ displs}{ integer array (of length group size). 
Entry +{\tt i} specifies the displacement relative to \mpiarg{recvbuf} at +which to place the incoming data from process {\tt i} (significant only +at root)} +\funcarg{\IN}{ recvtype}{ data type of recv buffer elements +(significant only at root) (handle)} +\funcarg{\IN}{ root}{ rank of receiving process (integer)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Igatherv(void*~sendbuf, int~sendcount, MPI\_Datatype~sendtype, void*~recvbuf, int~*recvcounts, int~*displs, MPI\_Datatype~recvtype, int~root, MPI\_Comm~comm, MPI\_Request~*request) } + +\mpifbind{MPI\_IGATHERV(SENDBUF, SENDCOUNT, SENDTYPE, RECVBUF, RECVCOUNTS, DISPLS, RECVTYPE, ROOT, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNT, SENDTYPE, RECVCOUNTS(*), DISPLS(*), RECVTYPE, ROOT, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Igatherv(const void*~sendbuf, int~sendcount, const MPI::Datatype\&~sendtype, void*~recvbuf, const~int~recvcounts[], const~int~displs[], const~MPI::Datatype\&~recvtype, int~root) const~=~0}{MPI::Request} + + + +This call starts a nonblocking variant of \mpifunc{MPI\_GATHERV} (see +Section~\ref{sec:coll-gather}). + + + +\subsection{Nonblocking Scatter} +\label{sec:nbcoll-iscatter} + +\begin{funcdef}{MPI\_ISCATTER(sendbuf, sendcount, sendtype, recvbuf, +recvcount, recvtype, root, comm, request)} +\funcarg{\IN}{ sendbuf}{ address of send buffer (choice, significant +only at root)} +\funcarg{\IN}{ sendcount}{ number of elements sent to each process (% +non-negative +integer, significant only at root)} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements +(significant only at root) (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} +\funcarg{\IN}{ recvcount}{ number of elements in receive buffer (% +non-negative +integer)} +\funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} +\funcarg{\IN}{ root}{ rank of sending process (integer)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Iscatter(void*~sendbuf, int~sendcount, MPI\_Datatype~sendtype, void*~recvbuf, int~recvcount, MPI\_Datatype~recvtype, int~root, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_ISCATTER(SENDBUF, SENDCOUNT, SENDTYPE, RECVBUF, RECVCOUNT, RECVTYPE, ROOT, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNT, SENDTYPE, RECVCOUNT, RECVTYPE, ROOT, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Iscatter(const void*~sendbuf, int~sendcount, const MPI::Datatype\&~sendtype, void*~recvbuf, int~recvcount, const~MPI::Datatype\&~recvtype, int~root) const~=~0}{MPI::Request} + + + +This call starts a nonblocking variant of \mpifunc{MPI\_SCATTER} (see +Section~\ref{sec:coll-scatter}). + + + +\begin{funcdef}{MPI\_ISCATTERV(sendbuf, sendcounts, displs, sendtype, +recvbuf, recvcount, recvtype, root, comm, request)} +\funcarg{\IN}{ sendbuf}{ address of send buffer (choice, significant +only at root)} +\funcarg{\IN}{ sendcounts}{% +non-negative +integer array (of length group size) +specifying the number of elements to send to each processor } +\funcarg{\IN}{ displs}{ integer array (of length group size). 
Entry +{\tt i} specifies the displacement (relative to \mpiarg{sendbuf}) from +which to take the outgoing data to process {\tt i}} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} +\funcarg{\IN}{ recvcount}{ number of elements in receive buffer (% +non-negative +integer)} +\funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} +\funcarg{\IN}{ root}{ rank of sending process (integer)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Iscatterv(void*~sendbuf, int~*sendcounts, int~*displs, MPI\_Datatype~sendtype, void*~recvbuf, int~recvcount, MPI\_Datatype~recvtype, int~root, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_ISCATTERV(SENDBUF, SENDCOUNTS, DISPLS, SENDTYPE, RECVBUF, RECVCOUNT, RECVTYPE, ROOT, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNTS(*), DISPLS(*), SENDTYPE, RECVCOUNT, RECVTYPE, ROOT, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Iscatterv(const void*~sendbuf, const~int~sendcounts[], const~int~displs[], const~MPI::Datatype\&~sendtype, void*~recvbuf, int~recvcount, const~MPI::Datatype\&~recvtype, int~root) const~=~0}{MPI::Request} + + + +This call starts a nonblocking variant of \mpifunc{MPI\_SCATTERV} (see +Section~\ref{sec:coll-scatter}). + + +\subsection{Nonblocking Gather-to-all} +\label{sec:nbcoll-iallcast} + +\begin{funcdef}{MPI\_IALLGATHER(sendbuf, sendcount, sendtype, recvbuf, +recvcount, recvtype, comm, request)} +\funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} +\funcarg{\IN}{ sendcount}{ number of elements in send buffer (% +non-negative +integer)} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} +\funcarg{\IN}{ recvcount}{ number of elements received from any process (% +non-negative +integer)} +\funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Iallgather(void*~sendbuf, int~sendcount, MPI\_Datatype~sendtype, void*~recvbuf, int~recvcount, MPI\_Datatype~recvtype, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_IALLGATHER(SENDBUF, SENDCOUNT, SENDTYPE, RECVBUF, RECVCOUNT, RECVTYPE, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNT, SENDTYPE, RECVCOUNT, RECVTYPE, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Iallgather(const void*~sendbuf, int~sendcount, const MPI::Datatype\&~sendtype, void*~recvbuf, int~recvcount, const~MPI::Datatype\&~recvtype) const~=~0}{MPI::Request} + + +This call starts a nonblocking variant of \mpifunc{MPI\_ALLGATHER} +(see Section~\ref{sec:coll-allcast}). + + +\begin{funcdef}{MPI\_IALLGATHERV(sendbuf, sendcount, sendtype, recvbuf, +recvcounts, displs, recvtype, comm, request)} +\funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} +\funcarg{\IN}{ sendcount}{ number of elements in send buffer (% +non-negative +integer)} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} +\funcarg{\IN}{ recvcounts}{% +non-negative +integer array (of length group size) +containing the number of elements that are received from each process} +\funcarg{\IN}{ displs}{ integer array (of length group size). 
Entry +{\tt i} specifies the displacement (relative to \mpiarg{recvbuf}) at +which to place the incoming data from process {\tt i}} +\funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Iallgatherv(void*~sendbuf, int~sendcount, +MPI\_Datatype~sendtype, void*~recvbuf, int~*recvcounts, int~*displs, +MPI\_Datatype~recvtype, MPI\_Comm~comm, MPI\_Request *request)} + +\mpifbind{MPI\_IALLGATHERV(SENDBUF, SENDCOUNT, SENDTYPE, RECVBUF, RECVCOUNTS, DISPLS, RECVTYPE, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNT, SENDTYPE, RECVCOUNTS(*), DISPLS(*), RECVTYPE, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Iallgatherv(const void*~sendbuf, int~sendcount, const MPI::Datatype\&~sendtype, void*~recvbuf, const~int~recvcounts[], const~int~displs[], const~MPI::Datatype\&~recvtype) const~=~0}{MPI::Request} + + +This call starts a nonblocking variant of \mpifunc{MPI\_ALLGATHERV} (see +Section~\ref{sec:coll-allcast}). + + + +\subsection{Nonblocking All-to-All Scatter/Gather} +\label{sec:nbcoll-ialltoall} + +\begin{funcdef}{MPI\_IALLTOALL(sendbuf, sendcount, sendtype, recvbuf, +recvcount, recvtype, comm, request)} +\funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} +\funcarg{\IN}{ sendcount}{ number of elements sent to each process (% +non-negative +integer)} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} +\funcarg{\IN}{ recvcount}{ number of elements received from any process (% +non-negative +integer)} +\funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Ialltoall(void*~sendbuf, int~sendcount, MPI\_Datatype~sendtype, void*~recvbuf, int~recvcount, MPI\_Datatype~recvtype, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_IALLTOALL(SENDBUF, SENDCOUNT, SENDTYPE, RECVBUF, RECVCOUNT, RECVTYPE, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNT, SENDTYPE, RECVCOUNT, RECVTYPE, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Ialltoall(const void*~sendbuf, int~sendcount, const MPI::Datatype\&~sendtype, void*~recvbuf, int~recvcount, const~MPI::Datatype\&~recvtype) const~=~0}{MPI::Request} + + +This call starts a nonblocking variant of \mpifunc{MPI\_ALLTOALL} (see +Section~\ref{sec:coll-alltoall}). + +\begin{funcdef}{MPI\_IALLTOALLV(sendbuf, sendcounts, sdispls, sendtype, +recvbuf, recvcounts, rdispls, recvtype, comm, request)} +\funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} +\funcarg{\IN}{ sendcounts}{% +non-negative +integer array (of length group size) +specifying the number of elements to send to each processor} +\funcarg{\IN}{ sdispls}{ integer array (of length group size). Entry +{\tt j} specifies the displacement (relative to \mpiarg{sendbuf}) from +which to take the outgoing data destined for process {\tt j}} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} +\funcarg{\IN}{ recvcounts}{% +non-negative +integer array (of length group size) +specifying the number of elements that can be received from +each processor} +\funcarg{\IN}{ rdispls}{ integer array (of length group size). 
Entry +{\tt i} specifies the displacement (relative to \mpiarg{recvbuf}) at +which to place the incoming data from process {\tt i}} +\funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Ialltoallv(void*~sendbuf, int~*sendcounts, int~*sdispls, MPI\_Datatype~sendtype, void*~recvbuf, int~*recvcounts, int~*rdispls, MPI\_Datatype~recvtype, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_IALLTOALLV(SENDBUF, SENDCOUNTS, SDISPLS, SENDTYPE, RECVBUF, RECVCOUNTS, RDISPLS, RECVTYPE, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNTS(*), SDISPLS(*), SENDTYPE, RECVCOUNTS(*), RDISPLS(*), RECVTYPE, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Ialltoallv(const void*~sendbuf, const~int~sendcounts[], const~int~sdispls[], const~MPI::Datatype\&~sendtype, void*~recvbuf, const~int~recvcounts[], const~int~rdispls[], const~MPI::Datatype\&~recvtype) const~=~0}{MPI::Request} + + +This call starts a nonblocking variant of \mpifunc{MPI\_ALLTOALLV} (see +Section~\ref{sec:coll-alltoall}). + + +\begin{funcdef}{MPI\_IALLTOALLW(sendbuf, sendcounts, sdispls, sendtypes, +recvbuf, recvcounts, rdispls, recvtypes, comm, request)} +\funcarg{\IN}{sendbuf}{starting address of send buffer (choice)} +\funcarg{\IN}{sendcounts}{integer array (of length group size) specifying the +number of elements to send to each processor (array of +non-negative +integers)} +\funcarg{\IN}{sdispls}{integer array (of length group size). Entry {\tt j} specifies +the displacement in bytes (relative to \mpiarg{sendbuf}) from which to take +the outgoing data destined for process {\tt j} (array of integers)} +\funcarg{\IN}{sendtypes}{array of datatypes (of length group size). Entry {\tt j} +specifies the type of data to send to process {\tt j} (array of handles)} +\funcarg{\OUT}{recvbuf}{address of receive buffer (choice)} +\funcarg{\IN}{recvcounts}{integer array (of length group size) specifying the +number of elements that can be received from each processor (array of +non-negative +integers)} +\funcarg{\IN}{rdispls}{integer array (of length group size). Entry {\tt i} specifies +the displacement in bytes (relative to \mpiarg{recvbuf}) at which to place the +incoming data from process {\tt i} (array of integers)} +\funcarg{\IN}{recvtypes}{array of datatypes (of length group size). Entry {\tt i} +specifies the type of data received from process {\tt i} (array of handles)} +\funcarg{\IN}{comm}{communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Ialltoallw(void~*sendbuf, int~sendcounts[], int~sdispls[], MPI\_Datatype~sendtypes[], void~*recvbuf, int~recvcounts[], int~rdispls[], MPI\_Datatype~recvtypes[], MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_IALLTOALLW(SENDBUF, SENDCOUNTS, SDISPLS, SENDTYPES, RECVBUF, RECVCOUNTS, RDISPLS, RECVTYPES, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*)\\INTEGER SENDCOUNTS(*), SDISPLS(*), SENDTYPES(*), RECVCOUNTS(*), RDISPLS(*), RECVTYPES(*), COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Ialltoallw(const void* sendbuf, const int sendcounts[], const int sdispls[], const MPI::Datatype sendtypes[], void* recvbuf, const int recvcounts[], const int rdispls[], const MPI::Datatype recvtypes[]) const~=~0}{MPI::Request} + + + +This call starts a nonblocking variant of \mpifunc{MPI\_ALLTOALLW} (see +Section~\ref{sec:coll-alltoall}). 
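+
+\begin{example} {\rm
+\exindex{MPI\_Ialltoall}
+
+A sketch of overlapping a nonblocking all-to-all exchange with
+computation.  As in the other example fragments, we assume that {\tt comm}
+has been assigned an appropriate value and that the send buffer is filled
+before the operation is started; {\tt do\_independent\_work()} is a
+placeholder for computation that touches neither communication buffer.
+
+\begin{verbatim}
+    MPI_Comm comm;
+    int gsize, *sbuf, *rbuf, flag;
+    MPI_Request req;
+    ...
+    MPI_Comm_size(comm, &gsize);
+    sbuf = (int *)malloc(gsize*100*sizeof(int));
+    rbuf = (int *)malloc(gsize*100*sizeof(int));
+    ...
+    MPI_Ialltoall(sbuf, 100, MPI_INT, rbuf, 100, MPI_INT, comm, &req);
+    /* sbuf must not be modified and rbuf must not be accessed
+       before the operation completes */
+    flag = 0;
+    while (!flag) {
+        do_independent_work();
+        MPI_Test(&req, &flag, MPI_STATUS_IGNORE);
+    }
+\end{verbatim}
+} \end{example}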
+ +\subsection{Nonblocking Reduce} +\label{subsec:nbcoll-ireduce} + +\begin{funcdef}{MPI\_IREDUCE(sendbuf, recvbuf, count, datatype, op, +root, comm, request)} +\funcarg{\IN}{ sendbuf}{ address of send buffer (choice)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice, +significant only at root)} +\funcarg{\IN}{ count}{ number of elements in send buffer (% +non-negative +integer)} +\funcarg{\IN}{ datatype}{ data type of elements of send buffer (handle)} +\funcarg{\IN}{ op}{ reduce operation (handle)} +\funcarg{\IN}{ root}{ rank of root process (integer)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}% +\mpibind{MPI\_Ireduce(void*~sendbuf, void*~recvbuf, int~count, MPI\_Datatype~datatype, MPI\_Op~op, int~root, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_IREDUCE(SENDBUF, RECVBUF, COUNT, DATATYPE, OP, ROOT, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER COUNT, DATATYPE, OP, ROOT, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Ireduce(const void*~sendbuf, void*~recvbuf, int~count, const~MPI::Datatype\&~datatype, const~MPI::Op\&~op, int~root) const~=~0}{MPI::Request} + + +This call starts a nonblocking variant of \mpifunc{MPI\_REDUCE} (see +Section~\ref{subsec:coll-reduce}). + + +\begin{implementors} +The implementation is explicitly allowed to use different algorithms for +blocking and nonblocking reduction operations that might change the +order of evaluation of the operations. However, as for +\mpifunc{MPI\_REDUCE}, it is strongly recommended that +\mpifunc{MPI\_IREDUCE} be implemented so that the same result be +obtained whenever the function is applied on the same arguments, +appearing in the same order. Note that this may prevent optimizations +that take advantage of the physical location of processes. +\end{implementors} + +\begin{users} +For operations which are not truly associative, the result delivered +upon completion of the nonblocking reduction may not exactly equal the +result delivered by the blocking reduction, even when specifying the +same arguments in the same order. +\end{users} + + + +\subsection{Nonblocking All-Reduce} +\label{subsec:nbcoll-all-reduce} + +\MPI/ includes +a variant +of the reduce operations +where the result is returned to all processes in +a +group. +\MPI/ requires that all processes +from the same group +participating in these operations +receive identical results. 
+
+\begin{funcdef}{MPI\_IALLREDUCE(sendbuf, recvbuf, count, datatype, op, comm, request)}
+\funcarg{\IN}{sendbuf}{ starting address of send buffer (choice)}
+\funcarg{\OUT}{recvbuf}{ starting address of receive buffer (choice)}
+\funcarg{\IN}{count}{ number of elements in send buffer (%
+non-negative
+integer)}
+\funcarg{\IN}{datatype}{ data type of elements of send buffer (handle)}
+\funcarg{\IN}{op}{ operation (handle)}
+\funcarg{\IN}{comm}{ communicator (handle)}
+\funcarg{\OUT}{request}{communication request (handle)}
+\end{funcdef}
+
+\cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}%
+\mpibind{MPI\_Iallreduce(void*~sendbuf, void*~recvbuf, int~count, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm, MPI\_Request~*request)}
+
+\mpifbind{MPI\_IALLREDUCE(SENDBUF, RECVBUF, COUNT, DATATYPE, OP, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER COUNT, DATATYPE, OP, COMM, REQUEST, IERROR}
+
+
+\mpicppemptybind{MPI::Comm::Iallreduce(const void*~sendbuf, void*~recvbuf, int~count, const MPI::Datatype\&~datatype, const~MPI::Op\&~op) const~=~0}{MPI::Request}
+
+
+This call starts a nonblocking variant of \mpifunc{MPI\_ALLREDUCE} (see
+Section~\ref{subsec:coll-all-reduce}).
+
+
+
+
+
+\subsection{Nonblocking Reduce-Scatter with Equal Blocks}
+\label{sec:nbcoll-reduce-scatter-block}
+
+\begin{funcdef}{MPI\_IREDUCE\_SCATTER\_BLOCK(sendbuf, recvbuf, recvcount,
+datatype, op, comm, request)}
+\funcarg{\IN}{sendbuf}{ starting address of send buffer (choice)}
+\funcarg{\OUT}{recvbuf}{ starting address of receive buffer (choice)}
+\funcarg{\IN}{recvcount}{ element count per block (non-negative integer)}
+\funcarg{\IN}{datatype}{ data type of elements of send and receive buffers (handle)}
+\funcarg{\IN}{op}{ operation (handle)}
+\funcarg{\IN}{comm}{ communicator (handle)}
+\funcarg{\OUT}{request}{ communication request (handle)}
+\end{funcdef}
+
+\cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}%
+\mpibind{MPI\_Ireduce\_scatter\_block(void*~sendbuf, void*~recvbuf, int~recvcount, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm, MPI\_Request~*request)}
+
+\mpifbind{MPI\_IREDUCE\_SCATTER\_BLOCK(SENDBUF, RECVBUF, RECVCOUNT, DATATYPE, OP, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER RECVCOUNT, DATATYPE, OP, COMM, REQUEST, IERROR}
+
+
+\mpicppemptybind{MPI::Comm::Ireduce\_scatter\_block(const void*~sendbuf, void*~recvbuf, int~recvcount, const~MPI::Datatype\&~datatype, const~MPI::Op\&~op) const~=~0}{MPI::Request}
+
+
+
+This call starts a nonblocking variant of \mpifunc{MPI\_REDUCE\_SCATTER\_BLOCK}
+(see Section~\ref{subsec:coll-reduce-scatter-block}).
+
+
+
+
+\subsection{Nonblocking Reduce-Scatter}
+\label{sec:nbcoll-reduce-scatter}
+
+\begin{funcdef}{MPI\_IREDUCE\_SCATTER(sendbuf, recvbuf, recvcounts,
+datatype, op, comm, request)}
+\funcarg{\IN}{sendbuf}{ starting address of send buffer (choice)}
+\funcarg{\OUT}{recvbuf}{ starting address of receive buffer (choice)}
+\funcarg{\IN}{recvcounts}{%
+non-negative
+integer array specifying the
+number of elements in result distributed to each process.
+Array must be identical on all calling processes.} +\funcarg{\IN}{datatype}{ data type of elements of input buffer (handle)} +\funcarg{\IN}{op}{ operation (handle)} +\funcarg{\IN}{comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}% +\mpibind{MPI\_Ireduce\_scatter(void*~sendbuf, void*~recvbuf, int~*recvcounts, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_IREDUCE\_SCATTER(SENDBUF, RECVBUF, RECVCOUNTS, DATATYPE, OP, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER RECVCOUNTS(*), DATATYPE, OP, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Ireduce\_scatter(const void*~sendbuf, void*~recvbuf, int~recvcounts[], const~MPI::Datatype\&~datatype, const~MPI::Op\&~op) const~=~0}{MPI::Request} + + + +This call starts a nonblocking variant of \mpifunc{MPI\_REDUCE\_SCATTER} +(see Section~\ref{subsec:coll-reduce-scatter}). + + + + + +\subsection{Nonblocking Inclusive Scan} +\label{subsec:nbcoll-iscan} + +\begin{funcdef}{MPI\_ISCAN(sendbuf, recvbuf, count, datatype, op, comm, request)} +\funcarg{\IN}{sendbuf}{starting address of send buffer (choice)} +\funcarg{\OUT}{recvbuf}{ starting address of receive buffer (choice)} +\funcarg{\IN}{count}{ number of elements in input buffer (% +non-negative +integer)} +\funcarg{\IN}{datatype}{ data type of elements of input buffer (handle)} +\funcarg{\IN}{op}{ operation (handle)} +\funcarg{\IN}{comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}% +\mpibind{MPI\_Iscan(void*~sendbuf, void*~recvbuf, int~count, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_ISCAN(SENDBUF, RECVBUF, COUNT, DATATYPE, OP, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER COUNT, DATATYPE, OP, COMM, REQUEST, IERROR} +\mpicppemptybind{MPI::Intracomm::Iscan(const void*~sendbuf, void*~recvbuf, int~count, const MPI::Datatype\&~datatype, const~MPI::Op\&~op) const}{MPI::Request} + +This call starts a nonblocking variant of \mpifunc{MPI\_SCAN} (see +Section~\ref{sec:coll-scan}). + + + + + + + +\subsection{Nonblocking Exclusive Scan} +\label{subsec:nbcoll-iexscan} + + +\begin{funcdef}{MPI\_IEXSCAN(sendbuf, recvbuf, count, datatype, op, comm, request)} +\funcarg{\IN}{sendbuf}{starting address of send buffer (choice) } +\funcarg{\OUT}{recvbuf}{starting address of receive buffer (choice) } +\funcarg{\IN}{count}{number of elements in input buffer (% +non-negative +integer) } +\funcarg{\IN}{datatype}{data type of elements of input buffer (handle) } +\funcarg{\IN}{op}{operation (handle) } +\funcarg{\IN}{comm}{ intracommunicator (handle) } +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}% +\mpibind{MPI\_Iexscan(void~*sendbuf, void~*recvbuf, int~count, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm, MPI\_Request~*request) } + +\mpifbind{MPI\_IEXSCAN(SENDBUF, RECVBUF, COUNT, DATATYPE, OP, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\INTEGER COUNT, DATATYPE, OP, COMM, REQUEST, IERROR } + +\mpicppemptybind{MPI::Intracomm::Iexscan(const void* sendbuf, void* recvbuf, int count, const MPI::Datatype\& datatype, const MPI::Op\& op) const}{MPI::Request} + +This call starts a nonblocking variant of \mpifunc{MPI\_EXSCAN} (see +Section~\ref{subsec:coll-exscan}). 
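+
+\begin{example} {\rm
+\label{coll-nbex-iscan-sketch}
+\exindex{MPI\_Iscan}
+\exindex{MPI\_Wait}
+
+As with the other nonblocking collective operations defined above, a
+scan can be overlapped with computation that does not access the
+communication buffers. The following non-normative sketch uses
+\mpifunc{MPI\_ISCAN}; the buffers \verb~values~ and \verb~prefix~, the
+count, and the helper \verb~do_other_work~ are assumed to be provided by
+the caller.
+
+\begin{verbatim}
+MPI_Request req;
+
+/* start an inclusive prefix sum over the ranks of comm */
+MPI_Iscan(values, prefix, count, MPI_DOUBLE, MPI_SUM, comm, &req);
+do_other_work();   /* placeholder; must not access values or prefix */
+MPI_Wait(&req, MPI_STATUS_IGNORE);
+/* prefix now holds, on process i, the reduction of the values
+   contributed by processes 0,...,i */
+\end{verbatim}
+
+}
+\end{example}
+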
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 \section{Correctness}
 \label{coll:correct}
 
@@ -3908,18 +4726,11 @@
 \begin{example}
 {\rm
 \label{coll-excorr1}
-\exindex{Deadlock!with MPI\_Bcast}%
-\exindex{MPI\_Bcast}%
+\exindex{Deadlock with MPI\_Bcast}
+\exindex{MPI\_Bcast}
 
 The following is erroneous.
 
-%%HEADER
-%%LANG: C
-%%FRAGMENT
-%%DECL: int rank, count, *buf1, *buf2; MPI_Comm comm;
-%%DECL: MPI_Datatype type;
-%%SKIPELIPSIS
-%%ENDHEADER
 \begin{verbatim}
 switch(rank) {
     case 0:
@@ -3946,8 +4757,8 @@
 \begin{example}
 {\rm
 \label{coll-excorr2}
-\exindex{Deadlock!with MPI\_Bcast}%
-\exindex{MPI\_Bcast}%
+\exindex{Deadlock with MPI\_Bcast}
+\exindex{MPI\_Bcast}
 
 The following is erroneous.
 
@@ -3985,15 +4796,16 @@
 
 Thus, the code will deadlock.
 Collective operations must be executed in an order so that
-no cyclic dependences occur.
+no cyclic dependencies occur. Nonblocking collective operations can
+alleviate this issue.
 }
 \end{example}
 
 \begin{example}
 {\rm
 \label{coll-exM}
-\exindex{Deadlock!with MPI\_Bcast}%
-\exindex{MPI\_Bcast}%
+\exindex{Deadlock with MPI\_Bcast}
+\exindex{MPI\_Bcast}
 
 The following is erroneous.
 
@@ -4036,8 +4848,8 @@
 \begin{example}
 {\rm
 \label{coll-exN}
-\exindex{Non-deterministic program with MPI\_Bcast}%
-\exindex{MPI\_Bcast}%
+\exindex{Non-deterministic program with MPI\_Bcast}
+\exindex{MPI\_Bcast}
 %
 A correct, but
 An unsafe,
@@ -4146,3 +4958,331 @@
 
 
 
+
+
+
+
+\begin{example} {\rm
+\label{coll-nbex1}
+\exindex{Mixing blocking and nonblocking collective operations}
+\exindex{MPI\_Ibarrier}
+\exindex{MPI\_Bcast}
+\exindex{MPI\_Wait}
+
+Blocking and nonblocking collective operations can be interleaved, i.e.,
+a blocking collective operation can be posted while a
+nonblocking collective operation is outstanding.
+
+\begin{verbatim}
+MPI_Request req;
+
+MPI_Ibarrier(comm, &req);
+MPI_Bcast(buf1, count, type, 0, comm);
+MPI_Wait(&req, MPI_STATUS_IGNORE);
+\end{verbatim}
+
+Each process starts a nonblocking barrier operation, participates in a
+blocking broadcast, and then waits until every other process has started
+the barrier operation. This effectively turns the broadcast into a
+synchronizing broadcast with possible communication/communication
+overlap (\mpifunc{MPI\_Bcast} is allowed, but not required, to
+synchronize).
+
+}
+\end{example}
+
+
+\begin{example} {\rm
+\label{coll-nbex2}
+\exindex{False matching of collective operations}
+\exindex{MPI\_Ibarrier}
+\exindex{MPI\_Bcast}
+\exindex{MPI\_Wait}
+
+The starting order of collective operations on a particular communicator
+defines their matching. The following example shows an erroneous
+matching of different collective operations on the same communicator.
+
+\begin{verbatim}
+MPI_Request req;
+switch(rank) {
+  case 0:
+    /* erroneous matching */
+    MPI_Ibarrier(comm, &req);
+    MPI_Bcast(buf1, count, type, 0, comm);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    break;
+  case 1:
+    /* erroneous matching */
+    MPI_Bcast(buf1, count, type, 0, comm);
+    MPI_Ibarrier(comm, &req);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    break;
+}
+\end{verbatim}
+
+This ordering would match \mpifunc{MPI\_Ibarrier} on rank 0 with
+\mpifunc{MPI\_Bcast} on rank 1, which is erroneous; the program behavior
+is undefined. However, if such an order is required, the user must
+perform the operations on different (duplicate) communicators.
+If started with two processes, the following program would be correct:
+
+\begin{verbatim}
+MPI_Request req;
+MPI_Comm dupcomm;
+MPI_Comm_dup(comm, &dupcomm);
+switch(rank) {
+  case 0:
+    MPI_Ibarrier(comm, &req);
+    MPI_Bcast(buf1, count, type, 0, dupcomm);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    break;
+  case 1:
+    MPI_Bcast(buf1, count, type, 0, dupcomm);
+    MPI_Ibarrier(comm, &req);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    break;
+}
+\end{verbatim}
+
+\begin{users}
+The use of different communicators offers some flexibility regarding the
+matching of nonblocking collective operations. In this sense,
+communicators could be used as an equivalent to tags. However,
+communicator construction might incur overhead, so this technique should
+be used carefully.
+\end{users}
+
+}
+\end{example}
+
+
+\begin{example} {\rm
+\label{coll-nbex3}
+\exindex{Progression of nonblocking collective operations}
+\exindex{MPI\_Ibarrier}
+\exindex{MPI\_Send}
+\exindex{MPI\_Recv}
+\exindex{MPI\_Wait}
+
+Nonblocking collective operations are subject to the same progression
+rules as nonblocking point-to-point operations. Thus, if started with two
+processes, the following program is a valid MPI program and is
+guaranteed to terminate:
+
+\begin{verbatim}
+MPI_Request req;
+
+switch(rank) {
+  case 0:
+    MPI_Ibarrier(comm, &req);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    MPI_Send(buf, count, dtype, 1, tag, comm);
+    break;
+  case 1:
+    MPI_Ibarrier(comm, &req);
+    MPI_Recv(buf, count, dtype, 0, tag, comm, MPI_STATUS_IGNORE);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    break;
+}
+\end{verbatim}
+
+The MPI library must progress the barrier in the \mpifunc{MPI\_Recv}
+call. Thus, the \mpifunc{MPI\_Wait} call on rank 0 will eventually
+complete, which enables the matching \mpifunc{MPI\_Send}, so that all
+calls eventually return.
+
+
+
+
+}
+\end{example}
+
+\begin{example} {\rm
+\label{coll-nbex4}
+\exindex{No matching of blocking and nonblocking collective operations}
+\exindex{MPI\_Ialltoall}
+\exindex{MPI\_Alltoall}
+\exindex{MPI\_Wait}
+
+Blocking and nonblocking collective operations do not match. The
+following example is erroneous.
+
+\begin{verbatim}
+MPI_Request req;
+
+switch(rank) {
+  case 0:
+    /* erroneous matching of Alltoall and Ialltoall */
+    MPI_Ialltoall(sbuf, scnt, stype, rbuf, rcnt, rtype, comm, &req);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    break;
+  case 1:
+    /* erroneous matching of Alltoall and Ialltoall */
+    MPI_Alltoall(sbuf, scnt, stype, rbuf, rcnt, rtype, comm);
+    break;
+}
+\end{verbatim}
+
+}
+\end{example}
+
+
+
+\begin{example} {\rm
+\label{coll-nbex5}
+\exindex{Mixing collective and point-to-point requests}
+\exindex{MPI\_Ibarrier}
+\exindex{MPI\_Send}
+\exindex{MPI\_Irecv}
+\exindex{MPI\_Waitall}
+\exindex{MPI\_Wait}
+
+Collective and point-to-point requests can be mixed in functions that
+enable multiple completions. If started with two processes, the
+following program is valid.
+
+\begin{verbatim}
+MPI_Request reqs[2];
+
+switch(rank) {
+  case 0:
+    MPI_Ibarrier(comm, &reqs[0]);
+    MPI_Send(buf, count, dtype, 1, tag, comm);
+    MPI_Wait(&reqs[0], MPI_STATUS_IGNORE);
+    break;
+  case 1:
+    MPI_Irecv(buf, count, dtype, 0, tag, comm, &reqs[0]);
+    MPI_Ibarrier(comm, &reqs[1]);
+    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
+    break;
+}
+\end{verbatim}
+
+The \mpifunc{MPI\_Waitall} call returns only after the barrier and the
+receive have completed.
+
+
+}
+\end{example}
+
+\begin{example} {\rm
+\label{coll-nbex6}
+\exindex{Pipelining nonblocking collective operations}
+\exindex{MPI\_Ibcast}
+\exindex{MPI\_Waitall}
+
+Multiple nonblocking collective operations can be outstanding on a
+single communicator and match in order.
+
+\begin{verbatim}
+MPI_Request reqs[3];
+
+compute(buf1);
+MPI_Ibcast(buf1, count, type, 0, comm, &reqs[0]);
+compute(buf2);
+MPI_Ibcast(buf2, count, type, 0, comm, &reqs[1]);
+compute(buf3);
+MPI_Ibcast(buf3, count, type, 0, comm, &reqs[2]);
+MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE);
+\end{verbatim}
+
+\begin{users}
+Pipelining and double-buffering techniques can be used efficiently to
+overlap computation and communication. However, having too many
+outstanding requests might have a negative impact on performance.
+\end{users}
+
+\begin{implementors}
+The use of pipelining may generate many outstanding requests. A
+high-quality hardware-supported implementation with limited resources
+should be able to fall back to a software implementation if its
+resources are exhausted. In this way, the number of outstanding requests
+would be limited only by the available memory.
+\end{implementors}
+
+}
+\end{example}
+
+
+\begin{example} {\rm
+\label{coll-nbex7}
+\exindex{Overlapping Communicators}
+\exindex{MPI\_Iallreduce}
+\exindex{MPI\_Waitall}
+
+Nonblocking collective operations can also be used to enable
+simultaneous collective operations on multiple overlapping
+communicators (see Figure~\ref{overlap_comms}). The following example is
+started with three processes and uses three communicators. The first
+communicator \verb~comm1~ includes ranks 0 and 1, \verb~comm2~ includes
+ranks 1 and 2, and \verb~comm3~ includes ranks 0 and 2. Blocking
+collective operations on these communicators may deadlock unless they
+are invoked in a carefully chosen order on each process. However,
+nonblocking collective operations can easily be used to avoid this
+ordering problem.
+
+\begin{verbatim}
+MPI_Request reqs[2];
+
+switch(rank) {
+  case 0:
+    MPI_Iallreduce(sbuf1, rbuf1, count, dtype, MPI_SUM, comm1, &reqs[0]);
+    MPI_Iallreduce(sbuf3, rbuf3, count, dtype, MPI_SUM, comm3, &reqs[1]);
+    break;
+  case 1:
+    MPI_Iallreduce(sbuf1, rbuf1, count, dtype, MPI_SUM, comm1, &reqs[0]);
+    MPI_Iallreduce(sbuf2, rbuf2, count, dtype, MPI_SUM, comm2, &reqs[1]);
+    break;
+  case 2:
+    MPI_Iallreduce(sbuf2, rbuf2, count, dtype, MPI_SUM, comm2, &reqs[0]);
+    MPI_Iallreduce(sbuf3, rbuf3, count, dtype, MPI_SUM, comm3, &reqs[1]);
+    break;
+}
+MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
+\end{verbatim}
+
+\begin{users}
+This method can be useful if overlapping neighboring regions (halo
+or ghost zones) are used in collective operations. The sequence of the
+two calls in each process is irrelevant because the two nonblocking
+operations are performed on different communicators.
+\end{users}
+
+\begin{figure}
+  \centering
+  \includegraphics[width=2.50in]{figures/overlap_comms}
+  \small
+  \caption[Overlapping Communicators Example]{Example with overlapping
+  communicators.}
+  \label{overlap_comms}
+\end{figure}
+
+
+}
+\end{example}
+
+\begin{example} {\rm
+\label{coll-nbex8}
+\exindex{Independence of nonblocking operations}
+\exindex{MPI\_Ibcast}
+
+The progress of multiple outstanding nonblocking collective operations
+is completely independent.
+
+\begin{verbatim}
+MPI_Request reqs[2];
+
+compute(buf1);
+MPI_Ibcast(buf1, count, type, 0, comm, &reqs[0]);
+compute(buf2);
+MPI_Ibcast(buf2, count, type, 0, comm, &reqs[1]);
+MPI_Wait(&reqs[1], MPI_STATUS_IGNORE);
+/* nothing is known about the status of the first bcast here */
+MPI_Wait(&reqs[0], MPI_STATUS_IGNORE);
+\end{verbatim}
+
+Completing the second \mpifunc{MPI\_IBCAST} is completely independent of
+the first one. This means that it is not guaranteed that the first
+broadcast operation has finished, or has even been started on other
+processes, when the second one is completed via \verb!reqs[1]!.
+
+}
+\end{example}
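+
+\begin{example} {\rm
+\label{coll-nbex-test-sketch}
+\exindex{MPI\_Iallreduce}
+\exindex{MPI\_Test}
+
+As a further non-normative sketch, the request returned by a nonblocking
+collective operation can also be completed with a test call
+(cf.\ Section~\ref{sec:pt2pt-nonblock}), which allows a process to poll
+for completion while performing other work. The buffers, the count, the
+datatype, and the helper \verb~do_a_little_work~ are assumed to be
+provided by the caller.
+
+\begin{verbatim}
+MPI_Request req;
+int flag = 0;
+
+MPI_Iallreduce(sbuf, rbuf, count, dtype, MPI_SUM, comm, &req);
+while (!flag) {
+  do_a_little_work();   /* placeholder; must not access sbuf or rbuf */
+  MPI_Test(&req, &flag, MPI_STATUS_IGNORE);
+}
+/* rbuf now holds the reduction result on every process */
+\end{verbatim}
+
+}
+\end{example}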