Index: coll.tex =================================================================== --- coll.tex (revision 495) +++ coll.tex (revision 549) @@ -16,67 +16,77 @@ The functions of this type provided by \MPI/ are the following: \begin{itemize} \item -\mpifunc{MPI\_BARRIER}: +\mpifunc{MPI\_BARRIER}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IBARRIER}}: Barrier synchronization across %all group members all members of a group -(Section~\ref{sec:coll-barrier}). +(Section~\ref{sec:coll-barrier}\MPIreplace{3.0}{109}{}{ and Section~\ref{sec:nbcoll-ibarrier}}). \item -\mpifunc{MPI\_BCAST}: +\mpifunc{MPI\_BCAST}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IBCAST}}: Broadcast from one member to all members of a group -(Section~\ref{sec:coll-broadcast}). +(Section~\ref{sec:coll-broadcast}\MPIreplace{3.0}{109}{}{ and Section~\ref{sec:nbcoll-ibroadcast}}). This is shown as ``broadcast'' in Figure~\ref{fig:collcom}. \item -\mpifunc{MPI\_GATHER}, \mpifunc{MPI\_GATHERV}: +\mpifunc{MPI\_GATHER}\MPIreplace{3.0}{109}{}{, MPI\_IGATHER}, +\mpifunc{MPI\_GATHERV}, \MPIreplace{3.0}{109}{}{MPI\_IGATHERV}: Gather data from %all group members to one member all members of a group to one member -(Section~\ref{sec:coll-gather}). +(Section~\ref{sec:coll-gather}\MPIreplace{3.0}{109}{}{ and Section~\ref{sec:nbcoll-igather}}). This is shown as ``gather'' in Figure~\ref{fig:collcom}. \item -\mpifunc{MPI\_SCATTER}, \mpifunc{MPI\_SCATTERV}: +\mpifunc{MPI\_SCATTER}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_ISCATTER}}, +\mpifunc{MPI\_SCATTERV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_ISCATTERV}}: Scatter data from one member to all members of a group -(Section~\ref{sec:coll-scatter}). +(Section~\ref{sec:coll-scatter}\MPIreplace{3.0}{109}{}{ and Section \ref{sec:nbcoll-iscatter}}). This is shown as ``scatter'' in Figure~\ref{fig:collcom}. \item -\mpifunc{MPI\_ALLGATHER}, \mpifunc{MPI\_ALLGATHERV}: +\mpifunc{MPI\_ALLGATHER}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLGATHER}}, +\mpifunc{MPI\_ALLGATHERV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLGATHERV}}: A variation on Gather where all members of %the a group receive the result -(Section~\ref {sec:coll-allcast}). +(Section~\ref{sec:coll-allcast}\MPIreplace{3.0}{109}{}{ and Section~\ref{sec:nbcoll-iallcast}}). This is shown as ``allgather'' in Figure~\ref{fig:collcom}. \item -\mpifunc{MPI\_ALLTOALL}, \mpifunc{MPI\_ALLTOALLV}, \mpifunc{MPI\_ALLTOALLW}: +\mpifunc{MPI\_ALLTOALL}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLTOALL}}, +\mpifunc{MPI\_ALLTOALLV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLTOALLV}}, +\mpifunc{MPI\_ALLTOALLW}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLTOALLW}}: Scatter/Gather data from all members to all members of a group (also called complete exchange) -(Section~\ref{sec:coll-alltoall}). +(Section~\ref{sec:coll-alltoall}\MPIreplace{3.0}{109}{}{ and Section~\ref{sec:nbcoll-ialltoall}}). This is shown as ``complete exchange'' in Figure~\ref{fig:collcom}. \item -\mpifunc{MPI\_ALLREDUCE}, \mpifunc{MPI\_REDUCE}: +\mpifunc{MPI\_ALLREDUCE}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLREDUCE}}, +\mpifunc{MPI\_REDUCE}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IREDUCE}}: Global reduction operations such as sum, max, min, or user-defined functions, where the result is returned to %all group members all members of a group +\MPIreplace{3.0}{109}{}{(Section~\ref{subsec:coll-all-reduce} and Section~\ref{subsec:nbcoll-all-reduce}) } and a variation where the result is returned to only one member -(Section~\ref{global-reduce}). 
+(Section~\ref{global-reduce}\MPIreplace{3.0}{109}{}{ and Section~\ref{subsec:nbcoll-ireduce}}). \item -\mpifunc{MPI\_REDUCE\_SCATTER}: +\mpifunc{MPI\_REDUCE\_SCATTER}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IREDUCE\_SCATTER}, \mpifunc{MPI\_REDUCE\_SCATTER\_BLOCK}, \mpifunc{MPI\_IREDUCE\_SCATTER\_BLOCK}}: A combined reduction and scatter operation -(Section~\ref{sec:coll-reduce-scatter}). +(Section~\ref{sec:coll-reduce-scatter}\MPIreplace{3.0}{109}{}{ and Section~\ref{sec:nbcoll-reduce-scatter}}). \item -\mpifunc{MPI\_SCAN}, \mpifunc{MPI\_EXSCAN}: +\mpifunc{MPI\_SCAN}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_ISCAN}}, +\mpifunc{MPI\_EXSCAN}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IEXSCAN}}: Scan across all members of a group (also called prefix) -(Section~\ref{sec:coll-scan}). +(Section~\ref{sec:coll-scan}\MPIreplace{3.0}{109}{}{, +Section~\ref{subsec:coll-exscan}, Section~\ref{subsec:nbcoll-iscan}, and +Section~\ref{subsec:nbcoll-iexscan}}). \end{itemize} \begin{figure} @@ -128,9 +138,16 @@ type maps (the layout in memory, see Section~\ref{sec:pt2pt-datatype}) between sender and receiver are still allowed. -Collective routine calls can (but are not required to) return as soon as their -participation in the collective communication is complete. The completion -of a call indicates that the caller is now free to modify locations in the +Collective \MPIreplace{3.0}{109}{routine calls}{operations} can (but are not required to) +\MPIreplace{3.0}{109}{return}{complete} as soon as \MPIreplace{3.0}{109}{their}{the +caller's} +participation in the collective communication is +\MPIreplace{3.0}{109}{complete}{finished}. \MPIreplace{3.0}{109}{}{A blocking operation is +complete as soon as the call returns. A nonblocking (immediate) call +requires a separate completion call (cf. Section~\ref{sec:pt2pt-nonblock}).} +The completion +of a \MPIreplace{3.0}{109}{call}{collective operation} indicates that the caller +is \MPIreplace{3.0}{109}{now}{} free to modify locations in the communication buffer. It does not indicate that other processes in the group have completed or even started the operation (unless otherwise @@ -138,14 +155,19 @@ implied by % % the description of the operation). -Thus, a collective communication call may, or +% htor: the MPIreplace macro causes some LaTeX problems here :-/ +\MPIreplace{3.0}{109}{Thus, a collective communication call may, or may not, have the effect of synchronizing all calling processes. -This statement excludes, of course, the barrier function. +This statement excludes, of course, the barrier +function}{Thus, a collective communication function may, or may not, +have the effect of synchronizing all calling processes. This statement +excludes, of course, the barrier operation}. Collective communication calls may use the same communicators as point-to-point communication; \MPI/ guarantees that messages generated on behalf of collective communication calls will not be confused with messages generated by point-to-point communication. +\MPIreplace{3.0}{109}{}{The collective operations do not have a message tag argument.} A more detailed discussion of correct use of collective routines is found in Section~\ref {coll:correct}. @@ -159,13 +181,13 @@ The statements about synchronization are made so as to allow a variety of implementations of the collective functions. -The collective operations do not accept a message tag argument. +\MPIreplace{3.0}{109}{The collective operations do not accept a message tag argument. 
If future revisions of \MPI/ define nonblocking collective functions, then tags (or a similar mechanism) % will might need to be added so as -to allow the dis-ambiguation of multiple, pending, collective operations. +to allow the dis-ambiguation of multiple, pending, collective operations.}{} \end{rationale} \begin{users} @@ -221,7 +243,7 @@ Groups and communicators are discussed in full detail in Chapter ~\ref{chap:context}. For the purposes of this chapter, it is sufficient to know that there are two types of communicators: {\em intra-communicators} and {\em inter-communicators}. -An intracommunicator can be thought of as an indentifier for a single group of processes +An intracommunicator can be thought of as an i\MPIreplace{3.0}{109}{n}{}dentifier for a single group of processes linked with a context. An intercommunicator identifies two distinct groups of processes linked with a context. @@ -260,9 +282,9 @@ Note that \constskip{MPI\_IN\_PLACE} is a special kind of value; it has the same restrictions on its use that \consti{MPI\_BOTTOM} has. - +\MPIreplace{3.0}{109}{ Some intracommunicator collective operations do not support the ``in place'' -option (e.g., \mpifunci{MPI\_ALLTOALLV}). +option (e.g., \mpifunci{MPI\_ALLTOALLV}).}{} \end{users} %\discuss{Does anyone know if the INTENT problem can be fixed by telling a @@ -283,33 +305,42 @@ \item[All-To-All] All processes contribute to the result. All processes receive the result. \begin{itemize} - \item \mpifunc{MPI\_ALLGATHER}, \mpifunc{MPI\_ALLGATHERV} - \item \mpifunc{MPI\_ALLTOALL}, \mpifunc{MPI\_ALLTOALLV}, - \mpifunc{MPI\_ALLTOALLW} - \item \mpifunc{MPI\_ALLREDUCE}, \mpifunc{MPI\_REDUCE\_SCATTER} - \item \mpifunc{MPI\_BARRIER} + \item \mpifunc{MPI\_ALLGATHER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLGATHER},} + \mpifunc{MPI\_ALLGATHERV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLGATHERV}} + \item \mpifunc{MPI\_ALLTOALL},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLTOALL},} + \mpifunc{MPI\_ALLTOALLV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLTOALLV}}, + \mpifunc{MPI\_ALLTOALLW}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IALLTOALLW}} + \item \mpifunc{MPI\_ALLREDUCE},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLREDUCE}, } + \mpifunc{MPI\_REDUCE\_SCATTER}\MPIreplace{3.0}{109}{}{, + \mpifunc{MPI\_IREDUCE\_SCATTER}, + \mpifunc{MPI\_REDUCE\_SCATTER\_BLOCK}, + \mpifunc{MPI\_IREDUCE\_SCATTER\_BLOCK}} + \item \mpifunc{MPI\_BARRIER}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IBARRIER}} \end{itemize} \item[All-To-One] All processes contribute to the result. One process receives the result. \begin{itemize} - \item \mpifunc{MPI\_GATHER}, \mpifunc{MPI\_GATHERV} - \item \mpifunc{MPI\_REDUCE} + \item \mpifunc{MPI\_GATHER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IGATHER},} + \mpifunc{MPI\_GATHERV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IGATHERV}} + \item \mpifunc{MPI\_REDUCE}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IREDUCE}} \end{itemize} \item[One-To-All] One process contributes to the result. All processes receive the result. \begin{itemize} - \item \mpifunc{MPI\_BCAST} - \item \mpifunc{MPI\_SCATTER}, \mpifunc{MPI\_SCATTERV} + \item \mpifunc{MPI\_BCAST}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IBCAST}} + \item \mpifunc{MPI\_SCATTER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_ISCATTER}, } + \mpifunc{MPI\_SCATTERV}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_ISCATTERV}} \end{itemize} \item[Other] Collective operations that do not fit into one of the above categories. 
\begin{itemize} %\item \mpifunc{MPI\_SCAN} - \item \mpifunc{MPI\_SCAN}, \mpifunc{MPI\_EXSCAN} + \item \mpifunc{MPI\_SCAN}, \MPIreplace{3.0}{109}{}{\mpifunc{MPI\_ISCAN}, } + \mpifunc{MPI\_EXSCAN}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_IEXSCAN}} \end{itemize} \end{description} -The data movement patterns of \mpifunc{MPI\_SCAN} -and \mpifunc{MPI\_EXSCAN} +The data movement patterns of \mpifunc{MPI\_SCAN}\MPIreplace{3.0}{109}{}{, \mpifunc{MPI\_ISCAN}} +\MPIreplace{3.0}{109}{and}{,} \mpifunc{MPI\_EXSCAN}\MPIreplace{3.0}{109}{}{, and \mpifunc{MPI\_IEXSCAN}} do not fit this taxonomy. %%%%%%%%%%%%%%%%%% @@ -343,14 +374,22 @@ % apply to intercommunicators: The following collective operations also apply to intercommunicators: \begin{itemize} -\item \mpifunc{MPI\_BARRIER}, -\item \mpifunc{MPI\_BCAST}, -\item \mpifunc{MPI\_GATHER}, \mpifunc{MPI\_GATHERV}, -\item \mpifunc{MPI\_SCATTER}, \mpifunc{MPI\_SCATTERV}, -\item \mpifunc{MPI\_ALLGATHER}, \mpifunc{MPI\_ALLGATHERV}, -\item \mpifunc{MPI\_ALLTOALL}, \mpifunc{MPI\_ALLTOALLV}, \mpifunc{MPI\_ALLTOALLW}, -\item \mpifunc{MPI\_ALLREDUCE}, \mpifunc{MPI\_REDUCE}, -\item \mpifunc{MPI\_REDUCE\_SCATTER}. +\item \mpifunc{MPI\_BARRIER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IBARRIER}} +\item \mpifunc{MPI\_BCAST},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IBCAST}} +\item \mpifunc{MPI\_GATHER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IGATHER},} + \mpifunc{MPI\_GATHERV},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IGATHERV},} +\item \mpifunc{MPI\_SCATTER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_ISCATTER},} + \mpifunc{MPI\_SCATTERV},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_ISCATTERV},} +\item \mpifunc{MPI\_ALLGATHER},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLGATHER},} + \mpifunc{MPI\_ALLGATHERV},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLGATHERV},} +\item \mpifunc{MPI\_ALLTOALL},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLTOALL},} + \mpifunc{MPI\_ALLTOALLV},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLTOALLV},} + \mpifunc{MPI\_ALLTOALLW},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLTOALLW},} +\item \mpifunc{MPI\_ALLREDUCE},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IALLREDUCE},} + \mpifunc{MPI\_REDUCE},\MPIreplace{3.0}{109}{}{ \mpifunc{MPI\_IREDUCE},} +\item \mpifunc{MPI\_REDUCE\_SCATTER}\MPIreplace{3.0}{109}{}{, +\mpifunc{MPI\_IREDUCE\_SCATTER}, \mpifunc{MPI\_REDUCE\_SCATTER\_BLOCK}, +\mpifunc{MPI\_IREDUCE\_SCATTER\_BLOCK}}. \end{itemize} In C++, the bindings for these functions are in the \ctype{MPI::Comm} class. 
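+
+\begin{example} {\rm
+\exindex{MPI\_Bcast}
+
+The following fragment sketches, for illustration only, how one of the rooted
+operations listed above might be invoked on an intercommunicator.  We assume
+that an intercommunicator {\tt intercomm} has already been created and that
+the flag {\tt in\_origin\_group} is nonzero exactly on the processes of the
+group that contains the broadcast root; the root argument conventions for
+rooted intercommunicator operations are described later in this chapter.
+
+\begin{verbatim}
+    MPI_Comm intercomm;   /* previously created intercommunicator */
+    int in_origin_group;  /* nonzero in the group containing the root */
+    int rank, buf[100];
+    ...
+    MPI_Comm_rank(intercomm, &rank);
+    if (in_origin_group) {
+        if (rank == 0)
+            /* this process supplies the data */
+            MPI_Bcast(buf, 100, MPI_INT, MPI_ROOT, intercomm);
+        else
+            /* the other processes in the root's group participate
+               without contributing data */
+            MPI_Bcast(buf, 100, MPI_INT, MPI_PROC_NULL, intercomm);
+    } else {
+        /* the remote group receives from rank 0 of the other group */
+        MPI_Bcast(buf, 100, MPI_INT, 0, intercomm);
+    }
+\end{verbatim}
+} \end{example}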
%But @@ -500,7 +539,7 @@ \section{Barrier Synchronization} \label{sec:coll-barrier} -\begin{funcdef}{MPI\_BARRIER( comm )} +\begin{funcdef}{MPI\_BARRIER(comm)} \funcarg{\IN}{comm}{communicator (handle)} \end{funcdef} @@ -532,7 +571,7 @@ \section{Broadcast} \label{sec:coll-broadcast} -\begin{funcdef}{MPI\_BCAST( buffer, count, datatype, root, comm )} +\begin{funcdef}{MPI\_BCAST(buffer, count, datatype, root, comm)} \funcarg{\INOUT}{ buffer}{starting address of buffer (choice)} \funcarg{\IN}{ count}{ number of entries in buffer (% non-negative @@ -542,7 +581,7 @@ \funcarg{\IN}{ comm}{ communicator (handle)} \end{funcdef} -\mpibind{MPI\_Bcast(void*~buffer, int~count, MPI\_Datatype~datatype, int~root, MPI\_Comm~comm )} +\mpibind{MPI\_Bcast(void*~buffer, int~count, MPI\_Datatype~datatype, int~root, MPI\_Comm~comm)} \mpifbind{MPI\_BCAST(BUFFER, COUNT, DATATYPE, ROOT, COMM, IERROR) \fargs BUFFER(*) \\ INTEGER COUNT, DATATYPE, ROOT, COMM, IERROR} % changed in MPI-2 %% \mpicppemptybind{MPI::Intracomm::Bcast(void*~buffer, int~count, const~MPI::Datatype\&~datatype, int~root) const}{void} @@ -611,7 +650,7 @@ int array[100]; int root=0; ... - MPI_Bcast( array, 100, MPI_INT, root, comm); + MPI_Bcast(array, 100, MPI_INT, root, comm); \end{verbatim} As in many of our example code fragments, we assume that some of the variables (such as {\tt comm} in the above) have been assigned @@ -621,7 +660,7 @@ \section{Gather} \label{sec:coll-gather} -\begin{funcdef}{MPI\_GATHER( sendbuf, sendcount, sendtype, recvbuf, +\begin{funcdef}{MPI\_GATHER(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm) } \funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} \funcarg{\IN}{ sendcount}{ number of elements in send buffer (% @@ -719,7 +758,7 @@ buffer arguments of the processes in group B must be consistent with the receive buffer argument of the root. -\begin{funcdef}{MPI\_GATHERV( sendbuf, sendcount, sendtype, recvbuf, +\begin{funcdef}{MPI\_GATHERV(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, root, comm) } \funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} \funcarg{\IN}{ sendcount}{ number of elements in send buffer (% @@ -839,9 +878,9 @@ int gsize,sendarray[100]; int root, *rbuf; ... - MPI_Comm_size( comm, &gsize); + MPI_Comm_size(comm, &gsize); rbuf = (int *)malloc(gsize*100*sizeof(int)); - MPI_Gather( sendarray, 100, MPI_INT, rbuf, 100, MPI_INT, root, comm); + MPI_Gather(sendarray, 100, MPI_INT, rbuf, 100, MPI_INT, root, comm); \end{verbatim} } \end{example} @@ -865,12 +904,12 @@ int gsize,sendarray[100]; int root, myrank, *rbuf; ... - MPI_Comm_rank( comm, &myrank); - if ( myrank == root) { - MPI_Comm_size( comm, &gsize); + MPI_Comm_rank(comm, &myrank); + if (myrank == root) { + MPI_Comm_size(comm, &gsize); rbuf = (int *)malloc(gsize*100*sizeof(int)); } - MPI_Gather( sendarray, 100, MPI_INT, rbuf, 100, MPI_INT, root, comm); + MPI_Gather(sendarray, 100, MPI_INT, rbuf, 100, MPI_INT, root, comm); \end{verbatim} } \end{example} @@ -916,11 +955,11 @@ int root, *rbuf; MPI_Datatype rtype; ... - MPI_Comm_size( comm, &gsize); - MPI_Type_contiguous( 100, MPI_INT, &rtype ); - MPI_Type_commit( &rtype ); + MPI_Comm_size(comm, &gsize); + MPI_Type_contiguous(100, MPI_INT, &rtype); + MPI_Type_commit(&rtype); rbuf = (int *)malloc(gsize*100*sizeof(int)); - MPI_Gather( sendarray, 100, MPI_INT, rbuf, 1, rtype, root, comm); + MPI_Gather(sendarray, 100, MPI_INT, rbuf, 1, rtype, root, comm); \end{verbatim} } \end{example} @@ -947,7 +986,7 @@ ... 
- MPI_Comm_size( comm, &gsize); + MPI_Comm_size(comm, &gsize); rbuf = (int *)malloc(gsize*stride*sizeof(int)); displs = (int *)malloc(gsize*sizeof(int)); rcounts = (int *)malloc(gsize*sizeof(int)); @@ -955,7 +994,7 @@ displs[i] = i*stride; rcounts[i] = 100; } - MPI_Gatherv( sendarray, 100, MPI_INT, rbuf, rcounts, displs, MPI_INT, + MPI_Gatherv(sendarray, 100, MPI_INT, rbuf, rcounts, displs, MPI_INT, root, comm); \end{verbatim} @@ -1006,7 +1045,7 @@ ... - MPI_Comm_size( comm, &gsize); + MPI_Comm_size(comm, &gsize); rbuf = (int *)malloc(gsize*stride*sizeof(int)); displs = (int *)malloc(gsize*sizeof(int)); rcounts = (int *)malloc(gsize*sizeof(int)); @@ -1016,9 +1055,9 @@ } /* Create datatype for 1 column of array */ - MPI_Type_vector( 100, 1, 150, MPI_INT, &stype); - MPI_Type_commit( &stype ); - MPI_Gatherv( sendarray, 1, stype, rbuf, rcounts, displs, MPI_INT, + MPI_Type_vector(100, 1, 150, MPI_INT, &stype); + MPI_Type_commit(&stype); + MPI_Gatherv(sendarray, 1, stype, rbuf, rcounts, displs, MPI_INT, root, comm); \end{verbatim} } \end{example} @@ -1067,8 +1106,8 @@ ... - MPI_Comm_size( comm, &gsize); - MPI_Comm_rank( comm, &myrank ); + MPI_Comm_size(comm, &gsize); + MPI_Comm_rank(comm, &myrank); rbuf = (int *)malloc(gsize*stride*sizeof(int)); displs = (int *)malloc(gsize*sizeof(int)); rcounts = (int *)malloc(gsize*sizeof(int)); @@ -1078,12 +1117,12 @@ } /* Create datatype for the column we are sending */ - MPI_Type_vector( 100-myrank, 1, 150, MPI_INT, &stype); - MPI_Type_commit( &stype ); + MPI_Type_vector(100-myrank, 1, 150, MPI_INT, &stype); + MPI_Type_commit(&stype); /* sptr is the address of start of "myrank" column */ sptr = &sendarray[0][myrank]; - MPI_Gatherv( sptr, 1, stype, rbuf, rcounts, displs, MPI_INT, + MPI_Gatherv(sptr, 1, stype, rbuf, rcounts, displs, MPI_INT, root, comm); \end{verbatim} @@ -1142,8 +1181,8 @@ ... - MPI_Comm_size( comm, &gsize); - MPI_Comm_rank( comm, &myrank ); + MPI_Comm_size(comm, &gsize); + MPI_Comm_rank(comm, &myrank); rbuf = (int *)malloc(gsize*stride*sizeof(int)); displs = (int *)malloc(gsize*sizeof(int)); rcounts = (int *)malloc(gsize*sizeof(int)); @@ -1156,10 +1195,10 @@ disp[0] = 0; disp[1] = 150*sizeof(int); type[0] = MPI_INT; type[1] = MPI_UB; blocklen[0] = 1; blocklen[1] = 1; - MPI_Type_create_struct( 2, blocklen, disp, type, &stype ); - MPI_Type_commit( &stype ); + MPI_Type_create_struct(2, blocklen, disp, type, &stype); + MPI_Type_commit(&stype); sptr = &sendarray[0][myrank]; - MPI_Gatherv( sptr, 100-myrank, stype, rbuf, rcounts, displs, MPI_INT, + MPI_Gatherv(sptr, 100-myrank, stype, rbuf, rcounts, displs, MPI_INT, root, comm); \end{Verbatim} } \end{example} @@ -1189,8 +1228,8 @@ ... - MPI_Comm_size( comm, &gsize); - MPI_Comm_rank( comm, &myrank ); + MPI_Comm_size(comm, &gsize); + MPI_Comm_rank(comm, &myrank); stride = (int *)malloc(gsize*sizeof(int)); ... @@ -1213,10 +1252,10 @@ rbuf = (int *)malloc(bufsize*sizeof(int)); /* Create datatype for the column we are sending */ - MPI_Type_vector( 100-myrank, 1, 150, MPI_INT, &stype); - MPI_Type_commit( &stype ); + MPI_Type_vector(100-myrank, 1, 150, MPI_INT, &stype); + MPI_Type_commit(&stype); sptr = &sendarray[0][myrank]; - MPI_Gatherv( sptr, 1, stype, rbuf, rcounts, displs, MPI_INT, + MPI_Gatherv(sptr, 1, stype, rbuf, rcounts, displs, MPI_INT, root, comm); \end{verbatim} } \end{example} @@ -1274,13 +1313,13 @@ ... 
- MPI_Comm_size( comm, &gsize); - MPI_Comm_rank( comm, &myrank ); + MPI_Comm_size(comm, &gsize); + MPI_Comm_rank(comm, &myrank); /* First, gather nums to root */ rcounts = (int *)malloc(gsize*sizeof(int)); - MPI_Gather( &num, 1, MPI_INT, rcounts, 1, MPI_INT, root, comm); + MPI_Gather(&num, 1, MPI_INT, rcounts, 1, MPI_INT, root, comm); /* root now has correct rcounts, using these we set displs[] so * that data is placed contiguously (or concatenated) at receive end */ @@ -1299,9 +1338,9 @@ type[0] = MPI_INT; type[1] = MPI_UB; blocklen[0] = 1; blocklen[1] = 1; MPI_Type_create_struct( 2, blocklen, disp, type, &stype ); - MPI_Type_commit( &stype ); + MPI_Type_commit(&stype); sptr = &sendarray[0][myrank]; - MPI_Gatherv( sptr, num, stype, rbuf, rcounts, displs, MPI_INT, + MPI_Gatherv(sptr, num, stype, rbuf, rcounts, displs, MPI_INT, root, comm); \end{Verbatim} } \end{example} @@ -1309,7 +1348,7 @@ \section{Scatter} \label{sec:coll-scatter} -\begin{funcdef}{MPI\_SCATTER( sendbuf, sendcount, sendtype, recvbuf, +\begin{funcdef}{MPI\_SCATTER(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm)} \funcarg{\IN}{ sendbuf}{ address of send buffer (choice, significant only at root)} @@ -1411,7 +1450,7 @@ must be consistent with the send buffer argument of the root. -\begin{funcdef}{MPI\_SCATTERV( sendbuf, sendcounts, displs, sendtype, +\begin{funcdef}{MPI\_SCATTERV(sendbuf, sendcounts, displs, sendtype, recvbuf, recvcount, recvtype, root, comm)} \funcarg{\IN}{ sendbuf}{ address of send buffer (choice, significant only at root)} @@ -1420,7 +1459,7 @@ integer array (of length group size) specifying the number of elements to send to each processor } \funcarg{\IN}{ displs}{ integer array (of length group size). Entry -{\tt i} specifies the displacement (relative to \mpiarg{sendbuf} from +{\tt i} specifies the displacement (relative to \mpiarg{sendbuf}\MPIreplace{3.0}{109}{}{)} from which to take the outgoing data to process {\tt i}} \funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} \funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} @@ -1526,10 +1565,10 @@ int gsize,*sendbuf; int root, rbuf[100]; ... - MPI_Comm_size( comm, &gsize); + MPI_Comm_size(comm, &gsize); sendbuf = (int *)malloc(gsize*100*sizeof(int)); ... - MPI_Scatter( sendbuf, 100, MPI_INT, rbuf, 100, MPI_INT, root, comm); + MPI_Scatter(sendbuf, 100, MPI_INT, rbuf, 100, MPI_INT, root, comm); \end{verbatim} } \end{example} @@ -1576,7 +1615,7 @@ ... - MPI_Comm_size( comm, &gsize); + MPI_Comm_size(comm, &gsize); sendbuf = (int *)malloc(gsize*stride*sizeof(int)); ... displs = (int *)malloc(gsize*sizeof(int)); @@ -1585,7 +1624,7 @@ displs[i] = i*stride; scounts[i] = 100; } - MPI_Scatterv( sendbuf, scounts, displs, MPI_INT, rbuf, 100, MPI_INT, + MPI_Scatterv(sendbuf, scounts, displs, MPI_INT, rbuf, 100, MPI_INT, root, comm); \end{verbatim} } \end{example} @@ -1634,8 +1673,8 @@ MPI_Datatype rtype; int i, *displs, *scounts, offset; ... - MPI_Comm_size( comm, &gsize); - MPI_Comm_rank( comm, &myrank ); + MPI_Comm_size(comm, &gsize); + MPI_Comm_rank(comm, &myrank); stride = (int *)malloc(gsize*sizeof(int)); ... 
@@ -1653,10 +1692,10 @@ } /* Create datatype for the column we are receiving */ - MPI_Type_vector( 100-myrank, 1, 150, MPI_INT, &rtype); - MPI_Type_commit( &rtype ); + MPI_Type_vector(100-myrank, 1, 150, MPI_INT, &rtype); + MPI_Type_commit(&rtype); rptr = &recvarray[0][myrank]; - MPI_Scatterv( sendbuf, scounts, displs, MPI_INT, rptr, 1, rtype, + MPI_Scatterv(sendbuf, scounts, displs, MPI_INT, rptr, 1, rtype, root, comm); \end{Verbatim} @@ -1687,7 +1726,7 @@ \section{Gather-to-all} \label{sec:coll-allcast} -\begin{funcdef}{MPI\_ALLGATHER( sendbuf, sendcount, sendtype, recvbuf, +\begin{funcdef}{MPI\_ALLGATHER(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm)} \funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} \funcarg{\IN}{ sendcount}{ number of elements in send buffer (% @@ -1797,7 +1836,7 @@ \end{users} %\discuss{This is an ``in place'' case with replacement.} -\begin{funcdef}{MPI\_ALLGATHERV( sendbuf, sendcount, sendtype, recvbuf, +\begin{funcdef}{MPI\_ALLGATHERV(sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm)} \funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} \funcarg{\IN}{ sendcount}{ number of elements in send buffer (% @@ -1908,9 +1947,9 @@ int gsize,sendarray[100]; int *rbuf; ... - MPI_Comm_size( comm, &gsize); + MPI_Comm_size(comm, &gsize); rbuf = (int *)malloc(gsize*100*sizeof(int)); - MPI_Allgather( sendarray, 100, MPI_INT, rbuf, 100, MPI_INT, comm); + MPI_Allgather(sendarray, 100, MPI_INT, rbuf, 100, MPI_INT, comm); \end{verbatim} After the call, every process has the group-wide concatenation of the @@ -2036,7 +2075,7 @@ integer array (of length group size) specifying the number of elements to send to each processor} \funcarg{\IN}{ sdispls}{ integer array (of length group size). Entry -{\tt j} specifies the displacement (relative to \mpiarg{sendbuf} from +{\tt j} specifies the displacement (relative to \mpiarg{sendbuf}\MPIreplace{3.0}{109}{}{)} from which to take the outgoing data destined for process {\tt j}} \funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} \funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} @@ -2046,7 +2085,7 @@ specifying the number of elements that can be received from each processor} \funcarg{\IN}{ rdispls}{ integer array (of length group size). Entry -{\tt i} specifies the displacement (relative to \mpiarg{recvbuf} at +{\tt i} specifies the displacement (relative to \mpiarg{recvbuf}\MPIreplace{3.0}{109}{}{)} at which to place the incoming data from process {\tt i}} \funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} \funcarg{\IN}{ comm}{ communicator (handle)} @@ -2375,7 +2414,7 @@ \subsection{Reduce} \label{subsec:coll-reduce} -\begin{funcdef}{MPI\_REDUCE( sendbuf, recvbuf, count, datatype, op, +\begin{funcdef}{MPI\_REDUCE(sendbuf, recvbuf, count, datatype, op, root, comm)} \funcarg{\IN}{ sendbuf}{ address of send buffer (choice)} \funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice, @@ -2886,7 +2925,7 @@ in[i].val = ain[i]; in[i].rank = myrank; } - MPI_Reduce( in, out, 30, MPI_DOUBLE_INT, MPI_MAXLOC, root, comm ); + MPI_Reduce(in, out, 30, MPI_DOUBLE_INT, MPI_MAXLOC, root, comm); /* At this point, the answer resides on process root */ if (myrank == root) { @@ -2933,8 +2972,8 @@ in(2,i) = myrank ! myrank is coerced to a double END DO - CALL MPI_REDUCE( in, out, 30, MPI_2DOUBLE_PRECISION, MPI_MAXLOC, root, - comm, ierr ) + CALL MPI_REDUCE(in, out, 30, MPI_2DOUBLE_PRECISION, MPI_MAXLOC, root, + comm, ierr) ! 
At this point, the answer resides on process root IF (myrank .EQ. root) THEN @@ -3209,14 +3248,14 @@ case. \end{implementors} -\begin{funcdef}{MPI\_OP\_FREE( op)} +\begin{funcdef}{MPI\_OP\_FREE(op)} \funcarg{\INOUT}{op}{ operation (handle) } \end{funcdef} \cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}% -\mpibind{MPI\_op\_free( MPI\_Op~*op)} +\mpibind{MPI\_op\_free(MPI\_Op~*op)} -\mpifbind{MPI\_OP\_FREE( OP, IERROR) \fargs INTEGER OP, IERROR} +\mpifbind{MPI\_OP\_FREE(OP, IERROR) \fargs INTEGER OP, IERROR} \mpicppemptybind{MPI::Op::Free()}{void} Marks a user-defined reduction operation for deallocation and sets @@ -3248,7 +3287,7 @@ /* the user-defined function */ -void myProd( Complex *in, Complex *inout, int *len, MPI_Datatype *dptr ) +void myProd(Complex *in, Complex *inout, int *len, MPI_Datatype *dptr) { int i; Complex c; @@ -3275,13 +3314,13 @@ /* explain to MPI how type Complex is defined */ - MPI_Type_contiguous( 2, MPI_DOUBLE, &ctype ); - MPI_Type_commit( &ctype ); + MPI_Type_contiguous(2, MPI_DOUBLE, &ctype); + MPI_Type_commit(&ctype); /* create the complex-product user-op */ MPI_Op_create( myProd, 1, &myOp ); - MPI_Reduce( a, answer, 100, ctype, myOp, root, comm ); + MPI_Reduce(a, answer, 100, ctype, myOp, root, comm); /* At this point, the answer, which consists of 100 Complexes, * resides on process root @@ -3306,7 +3345,7 @@ participating in these operations receive identical results. -\begin{funcdef}{MPI\_ALLREDUCE( sendbuf, recvbuf, count, datatype, op, comm)} +\begin{funcdef}{MPI\_ALLREDUCE(sendbuf, recvbuf, count, datatype, op, comm)} \funcarg{\IN}{sendbuf}{ starting address of send buffer (choice)} \funcarg{\OUT}{recvbuf}{ starting address of receive buffer (choice)} \funcarg{\IN}{count}{ number of elements in send buffer (% @@ -3613,7 +3652,7 @@ \label{sec:coll-scan} \subsection{Inclusive Scan} -\begin{funcdef}{MPI\_SCAN( sendbuf, recvbuf, count, datatype, op, comm )} +\begin{funcdef}{MPI\_SCAN(sendbuf, recvbuf, count, datatype, op, comm)} \funcarg{\IN}{sendbuf}{starting address of send buffer (choice)} \funcarg{\OUT}{recvbuf}{ starting address of receive buffer (choice)} \funcarg{\IN}{count}{ number of elements in input buffer (% @@ -3625,7 +3664,7 @@ \end{funcdef} \cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}% -\mpibind{MPI\_Scan(void*~sendbuf, void*~recvbuf, int~count, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm )} +\mpibind{MPI\_Scan(void*~sendbuf, void*~recvbuf, int~count, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm)} \mpifbind{MPI\_SCAN(SENDBUF, RECVBUF, COUNT, DATATYPE, OP, COMM, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER COUNT, DATATYPE, OP, COMM, IERROR} \mpicppemptybind{MPI::Intracomm::Scan(const void*~sendbuf, void*~recvbuf, int~count, const MPI::Datatype\&~datatype, const~MPI::Op\&~op) const}{void} @@ -3705,6 +3744,7 @@ %} \subsection{Exclusive Scan} +\label{subsec:coll-exscan} \label{coll-exscan} % Sect. 5.11.2 p.175 newlabel \status{Passed twice} @@ -3841,14 +3881,14 @@ /* the user-defined function */ -void segScan( SegScanPair *in, SegScanPair *inout, int *len, - MPI_Datatype *dptr ) +void segScan(SegScanPair *in, SegScanPair *inout, int *len, + MPI_Datatype *dptr) { int i; SegScanPair c; for (i=0; i< *len; ++i) { - if ( in->log == inout->log ) + if (in->log == inout->log) c.val = in->val + inout->val; else c.val = inout->val; @@ -3890,13 +3930,791 @@ MPI_Type_commit( &sspair ); /* create the segmented-scan user-op */ - MPI_Op_create( segScan, 0, &myOp ); + MPI_Op_create(segScan, 0, &myOp); ... 
MPI_Scan( &a, &answer, 1, sspair, myOp, comm ); \end{Verbatim} } \end{example}
+
+
+
+\section{Nonblocking Collective Operations}
+As described in Section~\ref{sec:pt2pt-nonblock}, performance of many
+applications can be improved by overlapping communication and
+computation, and many systems enable this. Nonblocking
+collective operations combine the potential benefits of nonblocking
+point-to-point operations, to exploit overlap and to avoid
+synchronization, with the optimized implementation and message
+scheduling provided by collective
+operations~\cite{hoefler-app-parco,hoefler-europvm08-osem}. One way of
+doing this would be to perform a blocking collective operation in a
+separate thread. An alternative mechanism that often leads to better
+performance (e.g., avoids context switching, scheduler overheads, and
+thread management) is to use nonblocking collective
+communication~\cite{hoefler-ib-threads}.
+
+The nonblocking collective communication model is similar to the model
+used for nonblocking point-to-point communication. A nonblocking
+call initiates a collective operation, which must be
+completed in a separate completion call.
+Once initiated, the operation may progress independently of any
+computation or other communication at participating processes. In this
+manner, nonblocking collective operations can mitigate possible
+synchronizing effects of collective operations by running them in the
+``background.''
+In addition to enabling communication-computation
+overlap, nonblocking collective operations can be used to perform
+collective operations on overlapping communicators, which would lead to
+deadlocks with blocking operations. Their semantic advantages can also be
+useful in combination with point-to-point communication.
+
+As in the nonblocking point-to-point case, all calls are local and
+return immediately, irrespective of the status of other processes. The
+call initiates the operation, which indicates that the system may
+start to copy data out of the send buffer and into the receive buffer.
+Once initiated, all associated send buffers should not be modified and
+all associated receive buffers should not be accessed until the
+collective operation completes. The
+call returns a request handle, which must be passed to a
+completion call.
+
+All completion calls (e.g., \mpifunc{MPI\_WAIT}) described in
+Section~\ref{subsec:pt2pt-commend} are supported for nonblocking collective
+operations. Similarly to the blocking case, nonblocking collective operations are
+considered to be complete when the local part of the operation is
+finished, i.e., for the caller, the semantics of the operation are
+guaranteed and all buffers can be safely accessed and modified.
+Completion does not indicate that other processes have completed or even
+started the operation (unless otherwise implied by the description of
+the operation). Completion of a particular nonblocking collective
+operation also does not indicate completion of any other posted
+nonblocking collective (or send-receive) operations, whether they are
+posted before or after the completed operation.
+
+\begin{users}
+Users should be aware that implementations are allowed, but not required
+(with the exception of \mpifunc{MPI\_IBARRIER}), to synchronize processes
+during the completion of a nonblocking collective operation.
+\end{users}
+
+Upon returning from a completion call in which a nonblocking collective
+operation completes, the \mpifunc{MPI\_ERROR} field in the associated
+status object is set appropriately.
The values of +the \mpifunc{MPI\_SOURCE} and \mpifunc{MPI\_TAG} fields are undefined. +It is valid to mix different request types (i.e., any combination of +collective requests, I/O requests, generalized requests, or +point-to-point requests) in functions that enable multiple completions +(e.g., \mpifunc{MPI\_WAITALL}). It is erroneous to call +\mpifunc{MPI\_REQUEST\_FREE} or \mpifunc{MPI\_CANCEL} for a request +associated with a nonblocking collective operation. +Nonblocking collective requests are not persistent. + +\begin{rationale} +Freeing an active nonblocking collective request could cause similar +problems as discussed for point-to-point requests (see Section~\ref{subsec:pt2pt-commend}). +Cancelling a request is not supported because the semantics of this +operation are not well-defined. +\end{rationale} + + +Multiple nonblocking collective +operations can be outstanding on a single communicator. +If the nonblocking call causes some system resource to be exhausted, +then it will fail and generate an MPI exception. Quality implementations +of MPI should ensure that this happens only in pathological cases. +That is, an MPI implementation should be able to support a large number +of pending nonblocking operations. + +Unlike point-to-point operations, nonblocking collective operations do +not match with blocking collective operations, and collective operations +do not have a tag argument. All processes must call collective +operations (blocking and nonblocking) in the same order per +communicator. In particular, once a process calls a collective +operation, all other processes in the communicator must eventually +call the same collective operation, and no other collective operation +with the same communicator +in between. This is consistent with the ordering rules for blocking +collective operations in threaded environments. + +\begin{rationale} +Matching blocking and nonblocking collective operations is not allowed +because the implementation might use different communication algorithms +for the two cases. Blocking collective operations may be optimized +for minimal time to completion, while nonblocking collective operations +may balance time to completion with CPU overhead and asynchronous +progression. + +The use of tags for collective operations can prevent certain hardware +optimizations. +\end{rationale} + +\begin{users} +If program semantics require matching blocking and nonblocking +collective operations, then a nonblocking collective operation can be +initiated and immediately completed with a blocking wait to emulate +blocking behavior. +\end{users} + +In terms of data movements, each nonblocking collective operation has +the same effect as its blocking counterpart for intracommunicators and +intercommunicators after completion. Likewise, upon completion, +nonblocking collective reduction operations have the same effect as +their blocking counterparts, and the same restrictions and +recommendations on reduction orders apply. + +The use of the ``in place'' option is allowed exactly as described for +the corresponding blocking collective operations. When using the ``in +place'' option, message buffers function as both send and receive +buffers. Such buffers should not be modified or accessed until the +operation completes. + +Progression rules for nonblocking collective operations are similar to +progression of nonblocking point-to-point operations, refer to +Section~\ref{subsec:pt2pt-semantics}. 
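+
+\begin{example} {\rm
+\exindex{MPI\_Ibcast}
+\exindex{MPI\_Ibarrier}
+
+The following fragment illustrates two of the points made above, in the
+style of the other example fragments in this chapter.  As usual, we
+assume that variables such as {\tt comm}, {\tt partner}, and {\tt tag}
+have been assigned appropriate values and that a matching send is posted
+by process {\tt partner}.  A collective request may be completed
+together with a point-to-point request by a single multiple-completion
+call, and a nonblocking collective followed immediately by a wait
+emulates its blocking counterpart.
+
+\begin{verbatim}
+    MPI_Request reqs[2];
+    int buf[100], token;
+    ...
+    /* a collective request and a point-to-point request may be
+       completed by one call to MPI_Waitall */
+    MPI_Ibcast(buf, 100, MPI_INT, 0, comm, &reqs[0]);
+    MPI_Irecv(&token, 1, MPI_INT, partner, tag, comm, &reqs[1]);
+    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
+
+    /* starting a nonblocking collective and completing it immediately
+       emulates the corresponding blocking operation */
+    MPI_Ibarrier(comm, &reqs[0]);
+    MPI_Wait(&reqs[0], MPI_STATUS_IGNORE);
+\end{verbatim}
+} \end{example}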
+
+
+\begin{implementors}
+Nonblocking collective operations can be implemented with local
+execution schedules~\cite{hoefler-sc07} using nonblocking point-to-point
+communication and a reserved tag-space.
+\end{implementors}
+
+
+\subsection{Nonblocking Barrier Synchronization}
+\label{sec:nbcoll-ibarrier}
+
+\begin{funcdef}{MPI\_IBARRIER(comm, request)}
+\funcarg{\IN}{comm}{communicator (handle)}
+\funcarg{\OUT}{request}{communication request (handle)}
+\end{funcdef}
+
+\mpibind{MPI\_Ibarrier(MPI\_Comm~comm, MPI\_Request~*request)}
+
+\mpifbind{MPI\_IBARRIER(COMM, REQUEST, IERROR) \fargs INTEGER COMM, REQUEST, IERROR}
+
+
+\mpicppemptybind{MPI::Comm::Ibarrier() const~=~0}{MPI::Request}
+
+
+
+
+\mpifunc{MPI\_IBARRIER} is a nonblocking version of
+\mpifunc{MPI\_BARRIER}. By calling \mpifunc{MPI\_IBARRIER}, a process
+notifies the other participating processes that it has reached the
+barrier. The call returns immediately,
+independent of whether other processes have called \mpifunc{MPI\_IBARRIER}.
+The usual barrier semantics are enforced at the corresponding completion
+operation (test or wait), which in the intracommunicator case will
+complete only after all other processes in the communicator have called
+\mpifunc{MPI\_IBARRIER}. In the intercommunicator case, it will complete
+when all processes in the remote group have called
+\mpifunc{MPI\_IBARRIER}.
+
+\begin{users}
+A nonblocking barrier can be used to hide latency. Moving
+independent computations between the \mpifunc{MPI\_IBARRIER} and the
+subsequent completion call can overlap the barrier latency and therefore
+shorten possible waiting times. The semantic properties are also useful
+when mixing collective operations and point-to-point messages.
+\end{users}
+
+
+\subsection{Nonblocking Broadcast}
+\label{sec:nbcoll-ibroadcast}
+
+\begin{funcdef}{MPI\_IBCAST(buffer, count, datatype, root, comm, request)}
+\funcarg{\INOUT}{ buffer}{starting address of buffer (choice)}
+\funcarg{\IN}{ count}{ number of entries in buffer (%
+non-negative
+integer)}
+\funcarg{\IN}{ datatype}{ data type of buffer (handle)}
+\funcarg{\IN}{ root}{ rank of broadcast root (integer)}
+\funcarg{\IN}{ comm}{ communicator (handle)}
+\funcarg{\OUT}{request}{communication request (handle)}
+\end{funcdef}
+
+\mpibind{MPI\_Ibcast(void*~buffer, int~count, MPI\_Datatype~datatype, int~root, MPI\_Comm~comm, MPI\_Request~*request)}
+
+\mpifbind{MPI\_IBCAST(BUFFER, COUNT, DATATYPE, ROOT, COMM, REQUEST, IERROR) \fargs BUFFER(*) \\ INTEGER COUNT, DATATYPE, ROOT, COMM, REQUEST, IERROR}
+
+
+\mpicppemptybind{MPI::Comm::Ibcast(void*~buffer, int~count, const~MPI::Datatype\&~datatype, int~root) const~=~0}{MPI::Request}
+
+
+
+
+
+
+This call starts a nonblocking variant of \mpifunc{MPI\_BCAST} (see
+Section~\ref{sec:coll-broadcast}).
+
+\subsubsection{Example using \mpifunc{MPI\_IBCAST}}
+
+The example in this section uses intracommunicators.
+
+\begin{example} {\rm
+\label{coll-exZ}
+\exindex{MPI\_Ibcast}
+
+Start a broadcast of 100 {\tt int}s from process {\tt 0} to every process in the
+group, perform some computation on independent data, and then complete
+the outstanding broadcast operation.
+
+\begin{verbatim}
+ MPI_Comm comm;
+ int array1[100], array2[100];
+ int root=0;
+ MPI_Request req;
+ ...
+ MPI_Ibcast(array1, 100, MPI_INT, root, comm, &req); + compute(array2, 100); + MPI_Wait(&req, MPI_STATUS_IGNORE); +\end{verbatim} +} \end{example} + + +\subsection{Nonblocking Gather} +\label{sec:nbcoll-igather} + +\begin{funcdef}{MPI\_IGATHER(sendbuf, sendcount, sendtype, recvbuf, +recvcount, recvtype, root, comm, request) } +\funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} +\funcarg{\IN}{ sendcount}{ number of elements in send buffer (% +non-negative +integer)} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice, +significant only at root)} +\funcarg{\IN}{ recvcount}{ number of elements for any single receive (% +non-negative +integer, significant only at root)} +\funcarg{\IN}{ recvtype}{ data type of recv buffer elements +(significant only at root) (handle)} +\funcarg{\IN}{ root}{ rank of receiving process (integer)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Igather(void*~sendbuf, int~sendcount, MPI\_Datatype~sendtype, void*~recvbuf, int~recvcount, MPI\_Datatype~recvtype, int~root, MPI\_Comm~comm, MPI\_Request~*request) } + +\mpifbind{MPI\_IGATHER(SENDBUF, SENDCOUNT, SENDTYPE, RECVBUF, RECVCOUNT, RECVTYPE, ROOT, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNT, SENDTYPE, RECVCOUNT, RECVTYPE, ROOT, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Igather(const void*~sendbuf, int~sendcount, const MPI::Datatype\&~sendtype, void*~recvbuf, int~recvcount, const~MPI::Datatype\&~recvtype, int~root) const~=~0}{MPI::Request} + + +This call starts a nonblocking variant of \mpifunc{MPI\_GATHER} (see +Section~\ref{sec:coll-gather}). + +\begin{funcdef}{MPI\_IGATHERV(sendbuf, sendcount, sendtype, recvbuf, +recvcounts, displs, recvtype, root, comm, request) } +\funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} +\funcarg{\IN}{ sendcount}{ number of elements in send buffer (% +non-negative +integer)} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice, +significant only at root)} +\funcarg{\IN}{ recvcounts}{% +non-negative +integer array (of length group size) +containing the number of elements that are received from each process +(significant only at root)} +\funcarg{\IN}{ displs}{ integer array (of length group size). 
Entry +{\tt i} specifies the displacement relative to \mpiarg{recvbuf} at +which to place the incoming data from process {\tt i} (significant only +at root)} +\funcarg{\IN}{ recvtype}{ data type of recv buffer elements +(significant only at root) (handle)} +\funcarg{\IN}{ root}{ rank of receiving process (integer)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Igatherv(void*~sendbuf, int~sendcount, MPI\_Datatype~sendtype, void*~recvbuf, int~*recvcounts, int~*displs, MPI\_Datatype~recvtype, int~root, MPI\_Comm~comm, MPI\_Request~*request) } + +\mpifbind{MPI\_IGATHERV(SENDBUF, SENDCOUNT, SENDTYPE, RECVBUF, RECVCOUNTS, DISPLS, RECVTYPE, ROOT, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNT, SENDTYPE, RECVCOUNTS(*), DISPLS(*), RECVTYPE, ROOT, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Igatherv(const void*~sendbuf, int~sendcount, const MPI::Datatype\&~sendtype, void*~recvbuf, const~int~recvcounts[], const~int~displs[], const~MPI::Datatype\&~recvtype, int~root) const~=~0}{MPI::Request} + + + +This call starts a nonblocking variant of \mpifunc{MPI\_GATHERV} (see +Section~\ref{sec:coll-gather}). + + + +\subsection{Nonblocking Scatter} +\label{sec:nbcoll-iscatter} + +\begin{funcdef}{MPI\_ISCATTER(sendbuf, sendcount, sendtype, recvbuf, +recvcount, recvtype, root, comm, request)} +\funcarg{\IN}{ sendbuf}{ address of send buffer (choice, significant +only at root)} +\funcarg{\IN}{ sendcount}{ number of elements sent to each process (% +non-negative +integer, significant only at root)} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements +(significant only at root) (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} +\funcarg{\IN}{ recvcount}{ number of elements in receive buffer (% +non-negative +integer)} +\funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} +\funcarg{\IN}{ root}{ rank of sending process (integer)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Iscatter(void*~sendbuf, int~sendcount, MPI\_Datatype~sendtype, void*~recvbuf, int~recvcount, MPI\_Datatype~recvtype, int~root, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_ISCATTER(SENDBUF, SENDCOUNT, SENDTYPE, RECVBUF, RECVCOUNT, RECVTYPE, ROOT, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNT, SENDTYPE, RECVCOUNT, RECVTYPE, ROOT, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Iscatter(const void*~sendbuf, int~sendcount, const MPI::Datatype\&~sendtype, void*~recvbuf, int~recvcount, const~MPI::Datatype\&~recvtype, int~root) const~=~0}{MPI::Request} + + + +This call starts a nonblocking variant of \mpifunc{MPI\_SCATTER} (see +Section~\ref{sec:coll-scatter}). + + + +\begin{funcdef}{MPI\_ISCATTERV(sendbuf, sendcounts, displs, sendtype, +recvbuf, recvcount, recvtype, root, comm, request)} +\funcarg{\IN}{ sendbuf}{ address of send buffer (choice, significant +only at root)} +\funcarg{\IN}{ sendcounts}{% +non-negative +integer array (of length group size) +specifying the number of elements to send to each processor } +\funcarg{\IN}{ displs}{ integer array (of length group size). 
Entry +{\tt i} specifies the displacement (relative to \mpiarg{sendbuf}) from +which to take the outgoing data to process {\tt i}} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} +\funcarg{\IN}{ recvcount}{ number of elements in receive buffer (% +non-negative +integer)} +\funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} +\funcarg{\IN}{ root}{ rank of sending process (integer)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Iscatterv(void*~sendbuf, int~*sendcounts, int~*displs, MPI\_Datatype~sendtype, void*~recvbuf, int~recvcount, MPI\_Datatype~recvtype, int~root, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_ISCATTERV(SENDBUF, SENDCOUNTS, DISPLS, SENDTYPE, RECVBUF, RECVCOUNT, RECVTYPE, ROOT, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNTS(*), DISPLS(*), SENDTYPE, RECVCOUNT, RECVTYPE, ROOT, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Iscatterv(const void*~sendbuf, const~int~sendcounts[], const~int~displs[], const~MPI::Datatype\&~sendtype, void*~recvbuf, int~recvcount, const~MPI::Datatype\&~recvtype, int~root) const~=~0}{MPI::Request} + + + +This call starts a nonblocking variant of \mpifunc{MPI\_SCATTERV} (see +Section~\ref{sec:coll-scatter}). + + +\subsection{Nonblocking Gather-to-all} +\label{sec:nbcoll-iallcast} + +\begin{funcdef}{MPI\_IALLGATHER(sendbuf, sendcount, sendtype, recvbuf, +recvcount, recvtype, comm, request)} +\funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} +\funcarg{\IN}{ sendcount}{ number of elements in send buffer (% +non-negative +integer)} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} +\funcarg{\IN}{ recvcount}{ number of elements received from any process (% +non-negative +integer)} +\funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Iallgather(void*~sendbuf, int~sendcount, MPI\_Datatype~sendtype, void*~recvbuf, int~recvcount, MPI\_Datatype~recvtype, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_IALLGATHER(SENDBUF, SENDCOUNT, SENDTYPE, RECVBUF, RECVCOUNT, RECVTYPE, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNT, SENDTYPE, RECVCOUNT, RECVTYPE, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Iallgather(const void*~sendbuf, int~sendcount, const MPI::Datatype\&~sendtype, void*~recvbuf, int~recvcount, const~MPI::Datatype\&~recvtype) const~=~0}{MPI::Request} + + +This call starts a nonblocking variant of \mpifunc{MPI\_ALLGATHER} +(see Section~\ref{sec:coll-allcast}). + + +\begin{funcdef}{MPI\_IALLGATHERV(sendbuf, sendcount, sendtype, recvbuf, +recvcounts, displs, recvtype, comm, request)} +\funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} +\funcarg{\IN}{ sendcount}{ number of elements in send buffer (% +non-negative +integer)} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} +\funcarg{\IN}{ recvcounts}{% +non-negative +integer array (of length group size) +containing the number of elements that are received from each process} +\funcarg{\IN}{ displs}{ integer array (of length group size). 
Entry +{\tt i} specifies the displacement (relative to \mpiarg{recvbuf}) at +which to place the incoming data from process {\tt i}} +\funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Iallgatherv(void*~sendbuf, int~sendcount, +MPI\_Datatype~sendtype, void*~recvbuf, int~*recvcounts, int~*displs, +MPI\_Datatype~recvtype, MPI\_Comm~comm, MPI\_Request *request)} + +\mpifbind{MPI\_IALLGATHERV(SENDBUF, SENDCOUNT, SENDTYPE, RECVBUF, RECVCOUNTS, DISPLS, RECVTYPE, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNT, SENDTYPE, RECVCOUNTS(*), DISPLS(*), RECVTYPE, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Iallgatherv(const void*~sendbuf, int~sendcount, const MPI::Datatype\&~sendtype, void*~recvbuf, const~int~recvcounts[], const~int~displs[], const~MPI::Datatype\&~recvtype) const~=~0}{MPI::Request} + + +This call starts a nonblocking variant of \mpifunc{MPI\_ALLGATHERV} (see +Section~\ref{sec:coll-allcast}). + + + +\subsection{Nonblocking All-to-All Scatter/Gather} +\label{sec:nbcoll-ialltoall} + +\begin{funcdef}{MPI\_IALLTOALL(sendbuf, sendcount, sendtype, recvbuf, +recvcount, recvtype, comm, request)} +\funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} +\funcarg{\IN}{ sendcount}{ number of elements sent to each process (% +non-negative +integer)} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} +\funcarg{\IN}{ recvcount}{ number of elements received from any process (% +non-negative +integer)} +\funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Ialltoall(void*~sendbuf, int~sendcount, MPI\_Datatype~sendtype, void*~recvbuf, int~recvcount, MPI\_Datatype~recvtype, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_IALLTOALL(SENDBUF, SENDCOUNT, SENDTYPE, RECVBUF, RECVCOUNT, RECVTYPE, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNT, SENDTYPE, RECVCOUNT, RECVTYPE, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Ialltoall(const void*~sendbuf, int~sendcount, const MPI::Datatype\&~sendtype, void*~recvbuf, int~recvcount, const~MPI::Datatype\&~recvtype) const~=~0}{MPI::Request} + + +This call starts a nonblocking variant of \mpifunc{MPI\_ALLTOALL} (see +Section~\ref{sec:coll-alltoall}). + +\begin{funcdef}{MPI\_IALLTOALLV(sendbuf, sendcounts, sdispls, sendtype, +recvbuf, recvcounts, rdispls, recvtype, comm, request)} +\funcarg{\IN}{ sendbuf}{ starting address of send buffer (choice)} +\funcarg{\IN}{ sendcounts}{% +non-negative +integer array (of length group size) +specifying the number of elements to send to each processor} +\funcarg{\IN}{ sdispls}{ integer array (of length group size). Entry +{\tt j} specifies the displacement (relative to \mpiarg{sendbuf}) from +which to take the outgoing data destined for process {\tt j}} +\funcarg{\IN}{ sendtype}{ data type of send buffer elements (handle)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice)} +\funcarg{\IN}{ recvcounts}{% +non-negative +integer array (of length group size) +specifying the number of elements that can be received from +each processor} +\funcarg{\IN}{ rdispls}{ integer array (of length group size). 
Entry +{\tt i} specifies the displacement (relative to \mpiarg{recvbuf}) at +which to place the incoming data from process {\tt i}} +\funcarg{\IN}{ recvtype}{ data type of receive buffer elements (handle)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Ialltoallv(void*~sendbuf, int~*sendcounts, int~*sdispls, MPI\_Datatype~sendtype, void*~recvbuf, int~*recvcounts, int~*rdispls, MPI\_Datatype~recvtype, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_IALLTOALLV(SENDBUF, SENDCOUNTS, SDISPLS, SENDTYPE, RECVBUF, RECVCOUNTS, RDISPLS, RECVTYPE, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER SENDCOUNTS(*), SDISPLS(*), SENDTYPE, RECVCOUNTS(*), RDISPLS(*), RECVTYPE, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Ialltoallv(const void*~sendbuf, const~int~sendcounts[], const~int~sdispls[], const~MPI::Datatype\&~sendtype, void*~recvbuf, const~int~recvcounts[], const~int~rdispls[], const~MPI::Datatype\&~recvtype) const~=~0}{MPI::Request} + + +This call starts a nonblocking variant of \mpifunc{MPI\_ALLTOALLV} (see +Section~\ref{sec:coll-alltoall}). + + +\begin{funcdef}{MPI\_IALLTOALLW(sendbuf, sendcounts, sdispls, sendtypes, +recvbuf, recvcounts, rdispls, recvtypes, comm, request)} +\funcarg{\IN}{sendbuf}{starting address of send buffer (choice)} +\funcarg{\IN}{sendcounts}{integer array (of length group size) specifying the +number of elements to send to each processor (array of +non-negative +integers)} +\funcarg{\IN}{sdispls}{integer array (of length group size). Entry {\tt j} specifies +the displacement in bytes (relative to \mpiarg{sendbuf}) from which to take +the outgoing data destined for process {\tt j} (array of integers)} +\funcarg{\IN}{sendtypes}{array of datatypes (of length group size). Entry {\tt j} +specifies the type of data to send to process {\tt j} (array of handles)} +\funcarg{\OUT}{recvbuf}{address of receive buffer (choice)} +\funcarg{\IN}{recvcounts}{integer array (of length group size) specifying the +number of elements that can be received from each processor (array of +non-negative +integers)} +\funcarg{\IN}{rdispls}{integer array (of length group size). Entry {\tt i} specifies +the displacement in bytes (relative to \mpiarg{recvbuf}) at which to place the +incoming data from process {\tt i} (array of integers)} +\funcarg{\IN}{recvtypes}{array of datatypes (of length group size). Entry {\tt i} +specifies the type of data received from process {\tt i} (array of handles)} +\funcarg{\IN}{comm}{communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\mpibind{MPI\_Ialltoallw(void~*sendbuf, int~sendcounts[], int~sdispls[], MPI\_Datatype~sendtypes[], void~*recvbuf, int~recvcounts[], int~rdispls[], MPI\_Datatype~recvtypes[], MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_IALLTOALLW(SENDBUF, SENDCOUNTS, SDISPLS, SENDTYPES, RECVBUF, RECVCOUNTS, RDISPLS, RECVTYPES, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*)\\INTEGER SENDCOUNTS(*), SDISPLS(*), SENDTYPES(*), RECVCOUNTS(*), RDISPLS(*), RECVTYPES(*), COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Ialltoallw(const void* sendbuf, const int sendcounts[], const int sdispls[], const MPI::Datatype sendtypes[], void* recvbuf, const int recvcounts[], const int rdispls[], const MPI::Datatype recvtypes[]) const~=~0}{MPI::Request} + + + +This call starts a nonblocking variant of \mpifunc{MPI\_ALLTOALLW} (see +Section~\ref{sec:coll-alltoall}). 
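+
+\begin{example} {\rm
+\exindex{MPI\_Ialltoall}
+
+A sketch of overlapping a nonblocking all-to-all exchange with
+computation.  As in the other example fragments, we assume that {\tt comm}
+has been assigned an appropriate value and that the send buffer is filled
+before the operation is started; {\tt do\_independent\_work()} is a
+placeholder for computation that touches neither communication buffer.
+
+\begin{verbatim}
+    MPI_Comm comm;
+    int gsize, *sbuf, *rbuf, flag;
+    MPI_Request req;
+    ...
+    MPI_Comm_size(comm, &gsize);
+    sbuf = (int *)malloc(gsize*100*sizeof(int));
+    rbuf = (int *)malloc(gsize*100*sizeof(int));
+    ...
+    MPI_Ialltoall(sbuf, 100, MPI_INT, rbuf, 100, MPI_INT, comm, &req);
+    /* sbuf must not be modified and rbuf must not be accessed
+       before the operation completes */
+    flag = 0;
+    while (!flag) {
+        do_independent_work();
+        MPI_Test(&req, &flag, MPI_STATUS_IGNORE);
+    }
+\end{verbatim}
+} \end{example}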
+ +\subsection{Nonblocking Reduce} +\label{subsec:nbcoll-ireduce} + +\begin{funcdef}{MPI\_IREDUCE(sendbuf, recvbuf, count, datatype, op, +root, comm, request)} +\funcarg{\IN}{ sendbuf}{ address of send buffer (choice)} +\funcarg{\OUT}{ recvbuf}{ address of receive buffer (choice, +significant only at root)} +\funcarg{\IN}{ count}{ number of elements in send buffer (% +non-negative +integer)} +\funcarg{\IN}{ datatype}{ data type of elements of send buffer (handle)} +\funcarg{\IN}{ op}{ reduce operation (handle)} +\funcarg{\IN}{ root}{ rank of root process (integer)} +\funcarg{\IN}{ comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}% +\mpibind{MPI\_Ireduce(void*~sendbuf, void*~recvbuf, int~count, MPI\_Datatype~datatype, MPI\_Op~op, int~root, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_IREDUCE(SENDBUF, RECVBUF, COUNT, DATATYPE, OP, ROOT, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER COUNT, DATATYPE, OP, ROOT, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Ireduce(const void*~sendbuf, void*~recvbuf, int~count, const~MPI::Datatype\&~datatype, const~MPI::Op\&~op, int~root) const~=~0}{MPI::Request} + + +This call starts a nonblocking variant of \mpifunc{MPI\_REDUCE} (see +Section~\ref{subsec:coll-reduce}). + + +\begin{implementors} +The implementation is explicitly allowed to use different algorithms for +blocking and nonblocking reduction operations that might change the +order of evaluation of the operations. However, as for +\mpifunc{MPI\_REDUCE}, it is strongly recommended that +\mpifunc{MPI\_IREDUCE} be implemented so that the same result be +obtained whenever the function is applied on the same arguments, +appearing in the same order. Note that this may prevent optimizations +that take advantage of the physical location of processes. +\end{implementors} + +\begin{users} +For operations which are not truly associative, the result delivered +upon completion of the nonblocking reduction may not exactly equal the +result delivered by the blocking reduction, even when specifying the +same arguments in the same order. +\end{users} + + + +\subsection{Nonblocking All-Reduce} +\label{subsec:nbcoll-all-reduce} + +\MPI/ includes +a variant +of the reduce operations +where the result is returned to all processes in +a +group. +\MPI/ requires that all processes +from the same group +participating in these operations +receive identical results. 
+
+\begin{funcdef}{MPI\_IALLREDUCE(sendbuf, recvbuf, count, datatype, op, comm, request)}
+\funcarg{\IN}{sendbuf}{ starting address of send buffer (choice)}
+\funcarg{\OUT}{recvbuf}{ starting address of receive buffer (choice)}
+\funcarg{\IN}{count}{ number of elements in send buffer (%
+non-negative
+integer)}
+\funcarg{\IN}{datatype}{ data type of elements of send buffer (handle)}
+\funcarg{\IN}{op}{ operation (handle)}
+\funcarg{\IN}{comm}{ communicator (handle)}
+\funcarg{\OUT}{request}{communication request (handle)}
+\end{funcdef}
+
+\cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}%
+\mpibind{MPI\_Iallreduce(void*~sendbuf, void*~recvbuf, int~count, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm, MPI\_Request~*request)}
+
+\mpifbind{MPI\_IALLREDUCE(SENDBUF, RECVBUF, COUNT, DATATYPE, OP, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER COUNT, DATATYPE, OP, COMM, REQUEST, IERROR}
+
+
+\mpicppemptybind{MPI::Comm::Iallreduce(const void*~sendbuf, void*~recvbuf, int~count, const MPI::Datatype\&~datatype, const~MPI::Op\&~op) const~=~0}{MPI::Request}
+
+
+This call starts a nonblocking variant of \mpifunc{MPI\_ALLREDUCE} (see
+Section~\ref{subsec:coll-all-reduce}).
+
+
+
+
+
+\subsection{Nonblocking Reduce-Scatter with Equal Blocks}
+\label{sec:nbcoll-reduce-scatter-block}
+
+\begin{funcdef}{MPI\_IREDUCE\_SCATTER\_BLOCK(sendbuf, recvbuf, recvcount,
+datatype, op, comm, request)}
+\funcarg{\IN}{sendbuf}{ starting address of send buffer (choice)}
+\funcarg{\OUT}{recvbuf}{ starting address of receive buffer (choice)}
+\funcarg{\IN}{recvcount}{ element count per block (non-negative integer)}
+\funcarg{\IN}{datatype}{ data type of elements of send and receive buffers (handle)}
+\funcarg{\IN}{op}{ operation (handle)}
+\funcarg{\IN}{comm}{ communicator (handle)}
+\funcarg{\OUT}{request}{ communication request (handle)}
+\end{funcdef}
+
+\cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}%
+\mpibind{MPI\_Ireduce\_scatter\_block(void*~sendbuf, void*~recvbuf, int~recvcount, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm, MPI\_Request~*request)}
+
+\mpifbind{MPI\_IREDUCE\_SCATTER\_BLOCK(SENDBUF, RECVBUF, RECVCOUNT, DATATYPE, OP, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER RECVCOUNT, DATATYPE, OP, COMM, REQUEST, IERROR}
+
+
+\mpicppemptybind{MPI::Comm::Ireduce\_scatter\_block(const void*~sendbuf, void*~recvbuf, int~recvcount, const~MPI::Datatype\&~datatype, const~MPI::Op\&~op) const~=~0}{MPI::Request}
+
+
+
+This call starts a nonblocking variant of \mpifunc{MPI\_REDUCE\_SCATTER\_BLOCK}
+(see Section~\ref{subsec:coll-reduce-scatter-block}).
+
+
+
+
+\subsection{Nonblocking Reduce-Scatter}
+\label{sec:nbcoll-reduce-scatter}
+
+\begin{funcdef}{MPI\_IREDUCE\_SCATTER(sendbuf, recvbuf, recvcounts,
+datatype, op, comm, request)}
+\funcarg{\IN}{sendbuf}{ starting address of send buffer (choice)}
+\funcarg{\OUT}{recvbuf}{ starting address of receive buffer (choice)}
+\funcarg{\IN}{recvcounts}{%
+non-negative
+integer array specifying the
+number of elements in result distributed to each process.
+Array must be identical on all calling processes.} +\funcarg{\IN}{datatype}{ data type of elements of input buffer (handle)} +\funcarg{\IN}{op}{ operation (handle)} +\funcarg{\IN}{comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}% +\mpibind{MPI\_Ireduce\_scatter(void*~sendbuf, void*~recvbuf, int~*recvcounts, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_IREDUCE\_SCATTER(SENDBUF, RECVBUF, RECVCOUNTS, DATATYPE, OP, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER RECVCOUNTS(*), DATATYPE, OP, COMM, REQUEST, IERROR} + + +\mpicppemptybind{MPI::Comm::Ireduce\_scatter(const void*~sendbuf, void*~recvbuf, int~recvcounts[], const~MPI::Datatype\&~datatype, const~MPI::Op\&~op) const~=~0}{MPI::Request} + + + +This call starts a nonblocking variant of \mpifunc{MPI\_REDUCE\_SCATTER} +(see Section~\ref{subsec:coll-reduce-scatter}). + + + + + +\subsection{Nonblocking Inclusive Scan} +\label{subsec:nbcoll-iscan} + +\begin{funcdef}{MPI\_ISCAN(sendbuf, recvbuf, count, datatype, op, comm, request)} +\funcarg{\IN}{sendbuf}{starting address of send buffer (choice)} +\funcarg{\OUT}{recvbuf}{ starting address of receive buffer (choice)} +\funcarg{\IN}{count}{ number of elements in input buffer (% +non-negative +integer)} +\funcarg{\IN}{datatype}{ data type of elements of input buffer (handle)} +\funcarg{\IN}{op}{ operation (handle)} +\funcarg{\IN}{comm}{ communicator (handle)} +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}% +\mpibind{MPI\_Iscan(void*~sendbuf, void*~recvbuf, int~count, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm, MPI\_Request~*request)} + +\mpifbind{MPI\_ISCAN(SENDBUF, RECVBUF, COUNT, DATATYPE, OP, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\ INTEGER COUNT, DATATYPE, OP, COMM, REQUEST, IERROR} +\mpicppemptybind{MPI::Intracomm::Iscan(const void*~sendbuf, void*~recvbuf, int~count, const MPI::Datatype\&~datatype, const~MPI::Op\&~op) const}{MPI::Request} + +This call starts a nonblocking variant of \mpifunc{MPI\_SCAN} (see +Section~\ref{sec:coll-scan}). + + + + + + + +\subsection{Nonblocking Exclusive Scan} +\label{subsec:nbcoll-iexscan} + + +\begin{funcdef}{MPI\_IEXSCAN(sendbuf, recvbuf, count, datatype, op, comm, request)} +\funcarg{\IN}{sendbuf}{starting address of send buffer (choice) } +\funcarg{\OUT}{recvbuf}{starting address of receive buffer (choice) } +\funcarg{\IN}{count}{number of elements in input buffer (% +non-negative +integer) } +\funcarg{\IN}{datatype}{data type of elements of input buffer (handle) } +\funcarg{\IN}{op}{operation (handle) } +\funcarg{\IN}{comm}{ intracommunicator (handle) } +\funcarg{\OUT}{request}{communication request (handle)} +\end{funcdef} + +\cdeclindex{MPI\_Op}\cdeclindex{MPI::Op}% +\mpibind{MPI\_Iexscan(void~*sendbuf, void~*recvbuf, int~count, MPI\_Datatype~datatype, MPI\_Op~op, MPI\_Comm~comm, MPI\_Request~*request) } + +\mpifbind{MPI\_IEXSCAN(SENDBUF, RECVBUF, COUNT, DATATYPE, OP, COMM, REQUEST, IERROR) \fargs SENDBUF(*), RECVBUF(*) \\INTEGER COUNT, DATATYPE, OP, COMM, REQUEST, IERROR } + +\mpicppemptybind{MPI::Intracomm::Iexscan(const void* sendbuf, void* recvbuf, int count, const MPI::Datatype\& datatype, const MPI::Op\& op) const}{MPI::Request} + +This call starts a nonblocking variant of \mpifunc{MPI\_EXSCAN} (see +Section~\ref{subsec:coll-exscan}). 
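+
+\begin{example} {\rm
+\label{coll-nbex-iscan-sketch}
+\exindex{MPI\_Iscan}
+\exindex{MPI\_Wait}
+
+As with the other nonblocking collective operations defined above, a
+scan can be overlapped with computation that does not access the
+communication buffers. The following non-normative sketch uses
+\mpifunc{MPI\_ISCAN}; the buffers \verb~values~ and \verb~prefix~, the
+count, and the helper \verb~do_other_work~ are assumed to be provided by
+the caller.
+
+\begin{verbatim}
+MPI_Request req;
+
+/* start an inclusive prefix sum over the ranks of comm */
+MPI_Iscan(values, prefix, count, MPI_DOUBLE, MPI_SUM, comm, &req);
+do_other_work();   /* placeholder; must not access values or prefix */
+MPI_Wait(&req, MPI_STATUS_IGNORE);
+/* prefix now holds, on process i, the reduction of the values
+   contributed by processes 0,...,i */
+\end{verbatim}
+
+}
+\end{example}
+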
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 \section{Correctness}
 \label{coll:correct}
 
@@ -3908,18 +4726,11 @@
 \begin{example}
 {\rm
 \label{coll-excorr1}
-\exindex{Deadlock!with MPI\_Bcast}%
-\exindex{MPI\_Bcast}%
+\exindex{Deadlock with MPI\_Bcast}
+\exindex{MPI\_Bcast}
 
 The following is erroneous.
 
-%%HEADER
-%%LANG: C
-%%FRAGMENT
-%%DECL: int rank, count, *buf1, *buf2; MPI_Comm comm;
-%%DECL: MPI_Datatype type;
-%%SKIPELIPSIS
-%%ENDHEADER
 \begin{verbatim}
 switch(rank) {
     case 0:
@@ -3946,8 +4757,8 @@
 \begin{example}
 {\rm
 \label{coll-excorr2}
-\exindex{Deadlock!with MPI\_Bcast}%
-\exindex{MPI\_Bcast}%
+\exindex{Deadlock with MPI\_Bcast}
+\exindex{MPI\_Bcast}
 
 The following is erroneous.
 
@@ -3985,15 +4796,16 @@
 
 Thus, the code will deadlock.
 Collective operations must be executed in an order so that
-no cyclic dependences occur.
+no cyclic dependencies occur. Nonblocking collective operations can
+alleviate this issue.
 }
 \end{example}
 
 \begin{example}
 {\rm
 \label{coll-exM}
-\exindex{Deadlock!with MPI\_Bcast}%
-\exindex{MPI\_Bcast}%
+\exindex{Deadlock with MPI\_Bcast}
+\exindex{MPI\_Bcast}
 
 The following is erroneous.
 
@@ -4036,8 +4848,8 @@
 \begin{example}
 {\rm
 \label{coll-exN}
-\exindex{Non-deterministic program with MPI\_Bcast}%
-\exindex{MPI\_Bcast}%
+\exindex{Non-deterministic program with MPI\_Bcast}
+\exindex{MPI\_Bcast}
 %
 A correct, but
 An unsafe,
@@ -4146,3 +4958,331 @@
 
 
 
+
+
+
+
+\begin{example} {\rm
+\label{coll-nbex1}
+\exindex{Mixing blocking and nonblocking collective operations}
+\exindex{MPI\_Ibarrier}
+\exindex{MPI\_Bcast}
+\exindex{MPI\_Wait}
+
+Blocking and nonblocking collective operations can be interleaved, i.e.,
+a blocking collective operation can be posted while a
+nonblocking collective operation is outstanding.
+
+\begin{verbatim}
+MPI_Request req;
+
+MPI_Ibarrier(comm, &req);
+MPI_Bcast(buf1, count, type, 0, comm);
+MPI_Wait(&req, MPI_STATUS_IGNORE);
+\end{verbatim}
+
+Each process starts a nonblocking barrier operation, participates in a
+blocking broadcast, and then waits until every other process has started
+the barrier operation. This effectively turns the broadcast into a
+synchronizing broadcast with possible communication/communication
+overlap (\mpifunc{MPI\_Bcast} is allowed, but not required, to
+synchronize).
+
+}
+\end{example}
+
+
+\begin{example} {\rm
+\label{coll-nbex2}
+\exindex{False matching of collective operations}
+\exindex{MPI\_Ibarrier}
+\exindex{MPI\_Bcast}
+\exindex{MPI\_Wait}
+
+The starting order of collective operations on a particular communicator
+defines their matching. The following example shows an erroneous
+matching of different collective operations on the same communicator.
+
+\begin{verbatim}
+MPI_Request req;
+switch(rank) {
+  case 0:
+    /* erroneous matching */
+    MPI_Ibarrier(comm, &req);
+    MPI_Bcast(buf1, count, type, 0, comm);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    break;
+  case 1:
+    /* erroneous matching */
+    MPI_Bcast(buf1, count, type, 0, comm);
+    MPI_Ibarrier(comm, &req);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    break;
+}
+\end{verbatim}
+
+This ordering would match \mpifunc{MPI\_Ibarrier} on rank 0 with
+\mpifunc{MPI\_Bcast} on rank 1, which is erroneous; the program behavior
+is undefined. However, if such an order is required, the user must
+perform the operations on different (duplicate) communicators.
+If started with two processes, the following program would be correct:
+
+\begin{verbatim}
+MPI_Request req;
+MPI_Comm dupcomm;
+MPI_Comm_dup(comm, &dupcomm);
+switch(rank) {
+  case 0:
+    MPI_Ibarrier(comm, &req);
+    MPI_Bcast(buf1, count, type, 0, dupcomm);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    break;
+  case 1:
+    MPI_Bcast(buf1, count, type, 0, dupcomm);
+    MPI_Ibarrier(comm, &req);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    break;
+}
+\end{verbatim}
+
+\begin{users}
+The use of different communicators offers some flexibility regarding the
+matching of nonblocking collective operations. In this sense,
+communicators could be used as an equivalent to tags. However,
+communicator construction might incur overhead, so this technique should
+be used carefully.
+\end{users}
+
+}
+\end{example}
+
+
+\begin{example} {\rm
+\label{coll-nbex3}
+\exindex{Progression of nonblocking collective operations}
+\exindex{MPI\_Ibarrier}
+\exindex{MPI\_Send}
+\exindex{MPI\_Recv}
+\exindex{MPI\_Wait}
+
+Nonblocking collective operations are subject to the same progression
+rules as nonblocking point-to-point operations. Thus, if started with two
+processes, the following program is a valid MPI program and is
+guaranteed to terminate:
+
+\begin{verbatim}
+MPI_Request req;
+
+switch(rank) {
+  case 0:
+    MPI_Ibarrier(comm, &req);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    MPI_Send(buf, count, dtype, 1, tag, comm);
+    break;
+  case 1:
+    MPI_Ibarrier(comm, &req);
+    MPI_Recv(buf, count, dtype, 0, tag, comm, MPI_STATUS_IGNORE);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    break;
+}
+\end{verbatim}
+
+The MPI library must progress the barrier in the \mpifunc{MPI\_Recv}
+call. Thus, the \mpifunc{MPI\_Wait} call on rank 0 will eventually
+complete, which enables the matching \mpifunc{MPI\_Send}, so that all
+calls eventually return.
+
+
+
+
+}
+\end{example}
+
+\begin{example} {\rm
+\label{coll-nbex4}
+\exindex{No matching of blocking and nonblocking collective operations}
+\exindex{MPI\_Ialltoall}
+\exindex{MPI\_Alltoall}
+\exindex{MPI\_Wait}
+
+Blocking and nonblocking collective operations do not match. The
+following example is erroneous.
+
+\begin{verbatim}
+MPI_Request req;
+
+switch(rank) {
+  case 0:
+    /* erroneous matching of Alltoall and Ialltoall */
+    MPI_Ialltoall(sbuf, scnt, stype, rbuf, rcnt, rtype, comm, &req);
+    MPI_Wait(&req, MPI_STATUS_IGNORE);
+    break;
+  case 1:
+    /* erroneous matching of Alltoall and Ialltoall */
+    MPI_Alltoall(sbuf, scnt, stype, rbuf, rcnt, rtype, comm);
+    break;
+}
+\end{verbatim}
+
+}
+\end{example}
+
+
+
+\begin{example} {\rm
+\label{coll-nbex5}
+\exindex{Mixing collective and point-to-point requests}
+\exindex{MPI\_Ibarrier}
+\exindex{MPI\_Send}
+\exindex{MPI\_Irecv}
+\exindex{MPI\_Waitall}
+\exindex{MPI\_Wait}
+
+Collective and point-to-point requests can be mixed in functions that
+enable multiple completions. If started with two processes, the
+following program is valid.
+
+\begin{verbatim}
+MPI_Request reqs[2];
+
+switch(rank) {
+  case 0:
+    MPI_Ibarrier(comm, &reqs[0]);
+    MPI_Send(buf, count, dtype, 1, tag, comm);
+    MPI_Wait(&reqs[0], MPI_STATUS_IGNORE);
+    break;
+  case 1:
+    MPI_Irecv(buf, count, dtype, 0, tag, comm, &reqs[0]);
+    MPI_Ibarrier(comm, &reqs[1]);
+    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
+    break;
+}
+\end{verbatim}
+
+The \mpifunc{MPI\_Waitall} call returns only after the barrier and the
+receive have completed.
+
+
+}
+\end{example}
+
+\begin{example} {\rm
+\label{coll-nbex6}
+\exindex{Pipelining nonblocking collective operations}
+\exindex{MPI\_Ibcast}
+\exindex{MPI\_Waitall}
+
+Multiple nonblocking collective operations can be outstanding on a
+single communicator and match in order.
+
+\begin{verbatim}
+MPI_Request reqs[3];
+
+compute(buf1);
+MPI_Ibcast(buf1, count, type, 0, comm, &reqs[0]);
+compute(buf2);
+MPI_Ibcast(buf2, count, type, 0, comm, &reqs[1]);
+compute(buf3);
+MPI_Ibcast(buf3, count, type, 0, comm, &reqs[2]);
+MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE);
+\end{verbatim}
+
+\begin{users}
+Pipelining and double-buffering techniques can be used efficiently to
+overlap computation and communication. However, having too many
+outstanding requests might have a negative impact on performance.
+\end{users}
+
+\begin{implementors}
+The use of pipelining may generate many outstanding requests. A
+high-quality hardware-supported implementation with limited resources
+should be able to fall back to a software implementation if its
+resources are exhausted. In this way, the number of outstanding requests
+would be limited only by the available memory.
+\end{implementors}
+
+}
+\end{example}
+
+
+\begin{example} {\rm
+\label{coll-nbex7}
+\exindex{Overlapping Communicators}
+\exindex{MPI\_Iallreduce}
+\exindex{MPI\_Waitall}
+
+Nonblocking collective operations can also be used to enable
+simultaneous collective operations on multiple overlapping
+communicators (see Figure~\ref{overlap_comms}). The following example is
+started with three processes and uses three communicators. The first
+communicator \verb~comm1~ includes ranks 0 and 1, \verb~comm2~ includes
+ranks 1 and 2, and \verb~comm3~ includes ranks 0 and 2. Blocking
+collective operations on these communicators may deadlock unless they
+are invoked in a carefully chosen order on each process. However,
+nonblocking collective operations can easily be used to avoid this
+ordering problem.
+
+\begin{verbatim}
+MPI_Request reqs[2];
+
+switch(rank) {
+  case 0:
+    MPI_Iallreduce(sbuf1, rbuf1, count, dtype, MPI_SUM, comm1, &reqs[0]);
+    MPI_Iallreduce(sbuf3, rbuf3, count, dtype, MPI_SUM, comm3, &reqs[1]);
+    break;
+  case 1:
+    MPI_Iallreduce(sbuf1, rbuf1, count, dtype, MPI_SUM, comm1, &reqs[0]);
+    MPI_Iallreduce(sbuf2, rbuf2, count, dtype, MPI_SUM, comm2, &reqs[1]);
+    break;
+  case 2:
+    MPI_Iallreduce(sbuf2, rbuf2, count, dtype, MPI_SUM, comm2, &reqs[0]);
+    MPI_Iallreduce(sbuf3, rbuf3, count, dtype, MPI_SUM, comm3, &reqs[1]);
+    break;
+}
+MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
+\end{verbatim}
+
+\begin{users}
+This method can be useful if overlapping neighboring regions (halo
+or ghost zones) are used in collective operations. The sequence of the
+two calls in each process is irrelevant because the two nonblocking
+operations are performed on different communicators.
+\end{users}
+
+\begin{figure}
+  \centering
+  \includegraphics[width=2.50in]{figures/overlap_comms}
+  \small
+  \caption[Overlapping Communicators Example]{Example with overlapping
+  communicators.}
+  \label{overlap_comms}
+\end{figure}
+
+
+}
+\end{example}
+
+\begin{example} {\rm
+\label{coll-nbex8}
+\exindex{Independence of nonblocking operations}
+\exindex{MPI\_Ibcast}
+
+The progress of multiple outstanding nonblocking collective operations
+is completely independent.
+
+\begin{verbatim}
+MPI_Request reqs[2];
+
+compute(buf1);
+MPI_Ibcast(buf1, count, type, 0, comm, &reqs[0]);
+compute(buf2);
+MPI_Ibcast(buf2, count, type, 0, comm, &reqs[1]);
+MPI_Wait(&reqs[1], MPI_STATUS_IGNORE);
+/* nothing is known about the status of the first bcast here */
+MPI_Wait(&reqs[0], MPI_STATUS_IGNORE);
+\end{verbatim}
+
+Completing the second \mpifunc{MPI\_IBCAST} is completely independent of
+the first one. This means that it is not guaranteed that the first
+broadcast operation has finished, or has even been started on other
+processes, when the second one is completed via \verb!reqs[1]!.
+
+}
+\end{example}
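+
+\begin{example} {\rm
+\label{coll-nbex-test-sketch}
+\exindex{MPI\_Iallreduce}
+\exindex{MPI\_Test}
+
+As a further non-normative sketch, the request returned by a nonblocking
+collective operation can also be completed with a test call
+(cf.\ Section~\ref{sec:pt2pt-nonblock}), which allows a process to poll
+for completion while performing other work. The buffers, the count, the
+datatype, and the helper \verb~do_a_little_work~ are assumed to be
+provided by the caller.
+
+\begin{verbatim}
+MPI_Request req;
+int flag = 0;
+
+MPI_Iallreduce(sbuf, rbuf, count, dtype, MPI_SUM, comm, &req);
+while (!flag) {
+  do_a_little_work();   /* placeholder; must not access sbuf or rbuf */
+  MPI_Test(&req, &flag, MPI_STATUS_IGNORE);
+}
+/* rbuf now holds the reduction result on every process */
+\end{verbatim}
+
+}
+\end{example}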