/* * -- High Performance Computing Linpack Benchmark (HPL) * HPL - 2.0 - September 10, 2008 * Antoine P. Petitet * University of Tennessee, Knoxville * Innovative Computing Laboratory * (C) Copyright 2000-2008 All Rights Reserved * * -- Copyright notice and Licensing terms: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. All advertising materials mentioning features or use of this * software must display the following acknowledgement: * This product includes software developed at the University of * Tennessee, Knoxville, Innovative Computing Laboratory. * * 4. The name of the University, the name of the Laboratory, or the * names of its contributors may not be used to endorse or promote * products derived from this software without specific written * permission. * * -- Disclaimer: * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * --------------------------------------------------------------------- */ /* * Include files */ #include "hpl.h" #ifdef STDC_HEADERS void HPL_pdlaswp00T ( HPL_T_panel * PBCST, int * IFLAG, HPL_T_panel * PANEL, const int NN ) #else void HPL_pdlaswp00T ( PBCST, IFLAG, PANEL, NN ) HPL_T_panel * PBCST; int * IFLAG; HPL_T_panel * PANEL; const int NN; #endif { /* * Purpose * ======= * * HPL_pdlaswp00T applies the NB row interchanges to NN columns of the * trailing submatrix and broadcast a column panel. * * Bi-directional exchange is used to perform the swap :: broadcast of * the row panel U at once, resulting in a lower number of messages than * usual as well as a lower communication volume. With P process rows and * assuming bi-directional links, the running time of this function can * be approximated by: * * log_2(P) * (lat + NB*LocQ(N) / bdwth) * * where NB is the number of rows of the row panel U, N is the global * number of columns being updated, lat and bdwth are the latency and * bandwidth of the network for double precision real words. Mono * directional links will double this communication cost. * * Arguments * ========= * * PBCST (local input/output) HPL_T_panel * * On entry, PBCST points to the data structure containing the * panel (to be broadcast) information. * * IFLAG (local input/output) int * * On entry, IFLAG indicates whether or not the broadcast has * already been completed. If not, probing will occur, and the * outcome will be contained in IFLAG on exit. * * PANEL (local input/output) HPL_T_panel * * On entry, PANEL points to the data structure containing the * panel (to be broadcast and swapped) information. * * NN (local input) const int * On entry, NN specifies the local number of columns of the * trailing submatrix to be swapped and broadcast starting at * the current position. NN must be at least zero. * * --------------------------------------------------------------------- */ /* * .. Local Variables .. */ MPI_Comm comm; HPL_T_grid * grid; double * A, * U, * W; void * vptr = NULL; int * ipID, * lindxA, * lindxAU, * llen, * llen_sv; unsigned int ip2, ip2_=1, ipdist, ipow=1, mask=1, mydist, mydis_; int Cmsgid=MSGID_BEGIN_PFACT, Np2, align, hdim, i, icurrow, *iflag, ipA, ipW, *ipl, iprow, jb, k, lda, ldW, myrow, n, nprow, partner, root, size_, usize; #define LDU n /* .. * .. Executable Statements .. */ n = Mmin( NN, PANEL->n ); jb = PANEL->jb; /* * Quick return if there is nothing to do */ if( ( n <= 0 ) || ( jb <= 0 ) ) return; #ifdef HPL_DETAILED_TIMING HPL_ptimer( HPL_TIMING_LASWP ); #endif /* * Retrieve parameters from the PANEL data structure */ grid = PANEL->grid; nprow = grid->nprow; myrow = grid->myrow; comm = grid->col_comm; ip2 = (unsigned int)grid->row_ip2; hdim = grid->row_hdim; align = PANEL->algo->align; A = PANEL->A; U = PANEL->U; iflag = PANEL->IWORK; lda = PANEL->lda; icurrow = PANEL->prow; usize = jb * n; ldW = n + 1; /* * Allocate space for temporary W (ldW * jb) */ vptr = (void*)malloc( ( (size_t)(align) + ((size_t)(jb) * (size_t)(ldW))) * sizeof(double) ); if( vptr == NULL ) { HPL_pabort( __LINE__, "HPL_pdlaswp00T", "Memory allocation failed" ); } W = (double *)HPL_PTR( vptr, ((size_t)(align) * sizeof(double) ) ); /* * Construct ipID and its local counter parts lindxA, lindxAU - llen is * the number of rows/columns that I have in workspace and that I should * send. Compute lindx_, ipA, llen if it has not already been done for * this panel; */ k = (int)((unsigned int)(jb) << 1); ipl = iflag + 1; ipID = ipl + 1; lindxA = ipID + ((unsigned int)(k) << 1); lindxAU = lindxA + k; llen = lindxAU + k; llen_sv = llen + nprow; if( *iflag == -1 ) /* no index arrays have been computed so far */ { HPL_pipid( PANEL, ipl, ipID ); HPL_plindx0( PANEL, *ipl, ipID, lindxA, lindxAU, llen_sv ); *iflag = 0; } else if( *iflag == 1 ) /* HPL_pdlaswp01T called before: reuse ipID */ { HPL_plindx0( PANEL, *ipl, ipID, lindxA, lindxAU, llen_sv ); *iflag = 0; } /* * Copy the llen_sv into llen - Reset ipA to its correct value */ ipA = llen_sv[myrow]; for( i = 0; i < nprow; i++ ) { llen[i] = llen_sv[i]; } /* * For i in [0..2*jb), lindxA[i] is the offset in A of a row that ulti- * mately goes to U( lindxAU[i], : ) or U( :, lindxAU[i] ). In icurrow, * we directly pack into U, otherwise we pack into workspace. The first * entry of each column packed in workspace is in fact the row or column * offset in U where it should go to. */ if( myrow == icurrow ) { HPL_dlaswp01T( ipA, n, A, lda, U, LDU, lindxA, lindxAU ); } else { HPL_dlaswp02N( ipA, n, A, lda, W, W+1, ldW, lindxA, lindxAU ); } /* * Probe for column panel - forward it when available */ if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG ); /* * Algorithm for bi-directional data exchange: * * As long as I have not talked to a process that already had the data * from icurrow, I will be sending the workspace, otherwise I will be * sending U. Note that the columns in workspace contain the local index * in U they should go to. * * If I am receiving from a process that has the data from icurrow, I * will be receiving in U, copy the data of U that stays into A, and * then the columns I have in workspace into U; otherwise I will be re- * ceiving in the remaining workspace. If I am one of those processes * that already has the data from icurrow, I will be immediately copying * the data I have in my workspace into U. * * When I receive U, some of U should be copied in my piece of A before * I can copy the rows I have in my workspace into U. This information * is kept in the lists lindx_: the row lindxAU[i] should be copied in * the row lindxA[i] of my piece of A, just as in the reversed initial * packing operation. Those rows are thus the first ones in the work ar- * ray. After this operation has been performed, I will not need * those lindx arrays, and I will always be sending a buffer of size * jb x n, or n x jb, that is, U. * * At every step of the algorithm, it is necesary to update the list * llen, so that I can figure out how large the next messages I will be * sending/receiving are. It is obvious when I am sending U. It is not * otherwise. * * We choose icurrow to be the source of the bi-directional exchange. * This allows the processes in the non-power 2 part to receive U at the * first exchange, and then broadcast internally this U so that those * processes can grab their piece of A. */ if( myrow == icurrow ) { llen[myrow] = 0; ipA = 0; } ipW = ipA; Np2 = ( ( size_ = nprow - ip2 ) != 0 ); mydist = (unsigned int)MModSub( myrow, icurrow, nprow ); /* * bi-directional exchange: If nprow is not a power of 2, proc[i-ip2] * receives local data from proc[i] for all i in [ip2..nprow); icurrow * is the source, these last process indexes are relative to icurrow. */ if( ( Np2 != 0 ) && ( ( partner = (int)(mydist ^ ip2) ) < nprow ) ) { partner = MModAdd( icurrow, partner, nprow ); if( mydist == 0 ) /* I am the current row: I send U and recv W */ { (void) HPL_sdrv( U, usize, Cmsgid, W, llen[partner] * ldW, Cmsgid, partner, comm ); if( llen[partner] > 0 ) HPL_dlaswp03T( llen[partner], n, U, LDU, W, W+1, ldW ); } else if( mydist == ip2 ) { /* I recv U for later Bcast, I send my W */ (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize, Cmsgid, partner, comm ); } else /* None of us is icurrow, we exchange our Ws */ { if( ( mydist & ip2 ) != 0 ) { (void) HPL_send( W, llen[myrow]*ldW, partner, Cmsgid, comm ); } else { (void) HPL_recv( Mptr( W, 0, ipW, ldW ), llen[partner]*ldW, partner, Cmsgid, comm ); if( llen[partner] > 0 ) ipW += llen[partner]; } } } /* * Update llen */ for( i = 1; i < size_; i++ ) { iprow = MModAdd( icurrow, i, nprow ); partner = MModAdd( iprow, (int)(ip2), nprow ); llen[ iprow ] += llen[ partner ]; } /* * Probe for column panel - forward it when available */ if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG ); /* * power of 2 part of the processes collection: only processes [0..ip2) * are working; some of them (mydist >> (k+1) == 0) either send or re- * ceive U. At every step k, k is in [0 .. hdim), of the algorithm, a * process pair that exchanges U is such that (mydist >> (k+1) == 0). * Among those processes, the ones that are sending U are such that * mydist >> k == 0. */ if( mydist < ip2 ) { k = 0; while( k < hdim ) { partner = (int)(mydist ^ ipow); partner = MModAdd( icurrow, partner, nprow ); /* * Exchange and combine the local results - If I receive U, then I must * copy from U the rows that belong to my piece of A, and then update U * by copying in it the rows I have accumulated in W. Otherwise, I re- * ceive W. In this later case, and I have U, I shall update my copy of * U by copying in it the rows I have accumulated in W. If I did not * have U before, I simply need to update my pointer in W for later use. */ if( ( mydist >> (unsigned int)( k + 1 ) ) == 0 ) { if( ( mydist >> (unsigned int)(k) ) == 0 ) { (void) HPL_sdrv( U, usize, Cmsgid, Mptr( W, 0, ipW, ldW ), llen[partner]*ldW, Cmsgid, partner, comm ); HPL_dlaswp03T( llen[partner], n, U, LDU, Mptr( W, 0, ipW, ldW ), Mptr( W, 1, ipW, ldW ), ldW ); ipW += llen[partner]; } else { (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize, Cmsgid, partner, comm ); HPL_dlaswp04T( ipA, llen[myrow], n, U, LDU, A, lda, W, W+1, ldW, lindxA, lindxAU ); } } else { (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, Mptr( W, 0, ipW, ldW ), llen[partner]*ldW, Cmsgid, partner, comm ); ipW += llen[partner]; } /* * Update llen - Go to next process pairs */ iprow = icurrow; ipdist = 0; do { if( (unsigned int)( partner = (int)(ipdist ^ ipow) ) > ipdist ) { partner = MModAdd( icurrow, partner, nprow ); llen[iprow] += llen[partner]; llen[partner] = llen[iprow]; } iprow = MModAdd( iprow, 1, nprow ); ipdist++; } while( ipdist < ip2 ); ipow <<= 1; k++; /* * Probe for column panel - forward it when available */ if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG ); } } else { /* * non power of 2 part of the process collection: proc[ip2] broadcast U * to procs[ip2..nprow) (relatively to icurrow). */ if( size_ > 1 ) { k = size_ - 1; while( k > 1 ) { k >>= 1; ip2_ <<= 1; mask <<= 1; mask++; } root = MModAdd( icurrow, (int)(ip2), nprow ); mydis_ = (unsigned int)MModSub( myrow, root, nprow ); do { mask ^= ip2_; if( ( mydis_ & mask ) == 0 ) { partner = (int)(mydis_ ^ ip2_); if( ( mydis_ & ip2_ ) != 0 ) { (void) HPL_recv( U, usize, MModAdd( root, partner, nprow ), Cmsgid, comm ); } else if( partner < size_ ) { (void) HPL_send( U, usize, MModAdd( root, partner, nprow ), Cmsgid, comm ); } } ip2_ >>= 1; /* * Probe for column panel - forward it when available */ if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG ); } while( ip2_ > 0 ); } /* * Every process in [ip2..nprow) (relatively to icurrow) grabs its piece * of A. */ HPL_dlaswp05T( ipA, n, A, lda, U, LDU, lindxA, lindxAU ); } /* * If nprow is not a power of 2, proc[i-ip2] sends global result to * proc[i] for all i in [ip2..nprow); */ if( ( Np2 != 0 ) && ( ( partner = (int)(mydist ^ ip2) ) < nprow ) ) { partner = MModAdd( icurrow, partner, nprow ); if( ( mydist & ip2 ) != 0 ) { (void) HPL_recv( U, usize, partner, Cmsgid, comm ); } else { (void) HPL_send( U, usize, partner, Cmsgid, comm ); } } if( vptr ) free( vptr ); /* * Probe for column panel - forward it when available */ if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG ); #ifdef HPL_DETAILED_TIMING HPL_ptimer( HPL_TIMING_LASWP ); #endif /* * End of HPL_pdlaswp00T */ }