| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434 |
- /*
- * -- High Performance Computing Linpack Benchmark (HPL)
- * HPL - 2.0 - September 10, 2008
- * Antoine P. Petitet
- * University of Tennessee, Knoxville
- * Innovative Computing Laboratory
- * (C) Copyright 2000-2008 All Rights Reserved
- *
- * -- Copyright notice and Licensing terms:
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions, and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- * software must display the following acknowledgement:
- * This product includes software developed at the University of
- * Tennessee, Knoxville, Innovative Computing Laboratory.
- *
- * 4. The name of the University, the name of the Laboratory, or the
- * names of its contributors may not be used to endorse or promote
- * products derived from this software without specific written
- * permission.
- *
- * -- Disclaimer:
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
- * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- * ---------------------------------------------------------------------
- */
- /*
- * Include files
- */
- #include "hpl.h"
- #ifdef STDC_HEADERS
- void HPL_pdlaswp00T
- (
- HPL_T_panel * PBCST,
- int * IFLAG,
- HPL_T_panel * PANEL,
- const int NN
- )
- #else
- void HPL_pdlaswp00T
- ( PBCST, IFLAG, PANEL, NN )
- HPL_T_panel * PBCST;
- int * IFLAG;
- HPL_T_panel * PANEL;
- const int NN;
- #endif
- {
- /*
- * Purpose
- * =======
- *
- * HPL_pdlaswp00T applies the NB row interchanges to NN columns of the
- * trailing submatrix and broadcasts a column panel.
- *
- * Bi-directional exchange is used to perform the swap :: broadcast of
- * the row panel U at once, resulting in a lower number of messages than
- * usual as well as a lower communication volume. With P process rows and
- * assuming bi-directional links, the running time of this function can
- * be approximated by:
- *
- * log_2(P) * (lat + NB*LocQ(N) / bdwth)
- *
- * where NB is the number of rows of the row panel U, N is the global
- * number of columns being updated, lat and bdwth are the latency and
- * bandwidth of the network for double precision real words. Mono
- * directional links will double this communication cost.
- *
- * Arguments
- * =========
- *
- * PBCST (local input/output) HPL_T_panel *
- * On entry, PBCST points to the data structure containing the
- * panel (to be broadcast) information.
- *
- * IFLAG (local input/output) int *
- * On entry, IFLAG indicates whether or not the broadcast has
- * already been completed. If not, probing will occur, and the
- * outcome will be contained in IFLAG on exit.
- *
- * PANEL (local input/output) HPL_T_panel *
- * On entry, PANEL points to the data structure containing the
- * panel (to be broadcast and swapped) information.
- *
- * NN (local input) const int
- * On entry, NN specifies the local number of columns of the
- * trailing submatrix to be swapped and broadcast starting at
- * the current position. NN must be at least zero.
- *
- * Notes
- * =====
- *
- * The number of columns actually processed is n = Mmin( NN, PANEL->n ).
- * When n or PANEL->jb is not positive, the function returns at once:
- * no workspace is allocated, no message is exchanged, HPL_bcast is not
- * called and, with HPL_DETAILED_TIMING, the timer is not touched.
- *
- * The broadcast of PBCST is only advanced opportunistically: between
- * the communication phases below, HPL_bcast( PBCST, IFLAG ) is called
- * each time *IFLAG is still equal to HPL_KEEP_TESTING.
- *
- * A temporary workspace W is obtained from malloc and released before
- * returning; HPL_pabort is called when the allocation fails.
- *
- * With HPL_DETAILED_TIMING defined, HPL_ptimer( HPL_TIMING_LASWP ) is
- * called once after the quick return test and once at the very end.
- *
- * ---------------------------------------------------------------------
- */
- /*
- * .. Local Variables ..
- *
- * LDU is a macro standing for n, the leading dimension used for U in
- * every HPL_dlaswp0xT call below. No matching undef is visible in this
- * function, so the macro stays defined for the rest of the file.
- */
- MPI_Comm comm;
- HPL_T_grid * grid;
- double * A, * U, * W;
- void * vptr = NULL;
- int * ipID, * lindxA, * lindxAU, * llen,
- * llen_sv;
- unsigned int ip2, ip2_=1, ipdist, ipow=1, mask=1,
- mydist, mydis_;
- int Cmsgid=MSGID_BEGIN_PFACT, Np2, align,
- hdim, i, icurrow, *iflag, ipA, ipW, *ipl,
- iprow, jb, k, lda, ldW, myrow, n, nprow,
- partner, root, size_, usize;
- #define LDU n
- /* ..
- * .. Executable Statements ..
- */
- n = Mmin( NN, PANEL->n ); jb = PANEL->jb;
- /*
- * Quick return if there is nothing to do
- */
- if( ( n <= 0 ) || ( jb <= 0 ) ) return;
- #ifdef HPL_DETAILED_TIMING
- HPL_ptimer( HPL_TIMING_LASWP );
- #endif
- /*
- * Retrieve parameters from the PANEL data structure
- *
- * All messages below travel on the column communicator of the grid
- * (grid->col_comm). usize = jb * n is the message length used each time
- * U is sent or received. ldW = n + 1 is the leading dimension of the
- * workspace W: each packed column holds one leading entry followed by
- * n values (see the packing comment further down).
- */
- grid = PANEL->grid; nprow = grid->nprow; myrow = grid->myrow;
- comm = grid->col_comm; ip2 = (unsigned int)grid->row_ip2;
- hdim = grid->row_hdim; align = PANEL->algo->align;
- A = PANEL->A; U = PANEL->U; iflag = PANEL->IWORK;
- lda = PANEL->lda; icurrow = PANEL->prow; usize = jb * n;
- ldW = n + 1;
- /*
- * Allocate space for temporary W (ldW * jb)
- *
- * align extra doubles are requested on top of the jb * ldW doubles so
- * that W can be derived from vptr through HPL_PTR. All factors are cast
- * to size_t before being multiplied.
- * NOTE(review): HPL_PTR presumably rounds vptr up to a multiple of
- * align * sizeof(double) bytes - confirm against its definition.
- * vptr, not W, is the pointer that is later passed to free.
- */
- vptr = (void*)malloc( ( (size_t)(align) +
- ((size_t)(jb) * (size_t)(ldW))) *
- sizeof(double) );
- if( vptr == NULL )
- { HPL_pabort( __LINE__, "HPL_pdlaswp00T", "Memory allocation failed" ); }
- W = (double *)HPL_PTR( vptr, ((size_t)(align) * sizeof(double) ) );
- /*
- * Construct ipID and its local counter parts lindxA, lindxAU - llen is
- * the number of rows/columns that I have in workspace and that I should
- * send. Compute lindx_, ipA, llen if it has not already been done for
- * this panel;
- *
- * All these arrays are carved out of PANEL->IWORK, in this order:
- *
- * iflag 1 int state of the index arrays (see below)
- * ipl 1 int filled by HPL_pipid, passed on as *ipl
- * ipID 4*jb ints
- * lindxA 2*jb ints
- * lindxAU 2*jb ints
- * llen nprow ints working copy, modified during the exchange
- * llen_sv starts right after llen; its first nprow entries are read
- *
- * State kept in *iflag:
- * -1 nothing computed yet: HPL_pipid and HPL_plindx0 are called;
- * 1 ipID is reused, only HPL_plindx0 is called;
- * any other value: the arrays are used as they are.
- * In the first two cases *iflag is reset to 0 afterwards.
- */
- k = (int)((unsigned int)(jb) << 1); ipl = iflag + 1; ipID = ipl + 1;
- lindxA = ipID + ((unsigned int)(k) << 1); lindxAU = lindxA + k;
- llen = lindxAU + k; llen_sv = llen + nprow;
- if( *iflag == -1 ) /* no index arrays have been computed so far */
- {
- HPL_pipid( PANEL, ipl, ipID );
- HPL_plindx0( PANEL, *ipl, ipID, lindxA, lindxAU, llen_sv );
- *iflag = 0;
- }
- else if( *iflag == 1 ) /* HPL_pdlaswp01T called before: reuse ipID */
- {
- HPL_plindx0( PANEL, *ipl, ipID, lindxA, lindxAU, llen_sv );
- *iflag = 0;
- }
- /*
- * Copy the llen_sv into llen - Reset ipA to its correct value
- *
- * llen is modified in place by the exchange below, whereas llen_sv is
- * only read in this function; hence the fresh copy at every call.
- */
- ipA = llen_sv[myrow];
- for( i = 0; i < nprow; i++ ) { llen[i] = llen_sv[i]; }
- /*
- * For i in [0..2*jb), lindxA[i] is the offset in A of a row that ulti-
- * mately goes to U( lindxAU[i], : ) or U( :, lindxAU[i] ). In icurrow,
- * we directly pack into U, otherwise we pack into workspace. The first
- * entry of each column packed in workspace is in fact the row or column
- * offset in U where it should go to.
- *
- * This is why the workspace is handed over as the pair ( W, W+1 ) with
- * leading dimension ldW = n + 1: W addresses the leading offset entry
- * of each packed column, W+1 the n values that follow it.
- */
- if( myrow == icurrow )
- {
- HPL_dlaswp01T( ipA, n, A, lda, U, LDU, lindxA, lindxAU );
- }
- else
- {
- HPL_dlaswp02N( ipA, n, A, lda, W, W+1, ldW, lindxA, lindxAU );
- }
- /*
- * Probe for column panel - forward it when available
- */
- if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
- /*
- * Algorithm for bi-directional data exchange:
- *
- * As long as I have not talked to a process that already had the data
- * from icurrow, I will be sending the workspace, otherwise I will be
- * sending U. Note that the columns in workspace contain the local index
- * in U they should go to.
- *
- * If I am receiving from a process that has the data from icurrow, I
- * will be receiving in U, copy the data of U that stays into A, and
- * then the columns I have in workspace into U; otherwise I will be re-
- * ceiving in the remaining workspace. If I am one of those processes
- * that already has the data from icurrow, I will be immediately copying
- * the data I have in my workspace into U.
- *
- * When I receive U, some of U should be copied in my piece of A before
- * I can copy the rows I have in my workspace into U. This information
- * is kept in the lists lindx_: the row lindxAU[i] should be copied in
- * the row lindxA[i] of my piece of A, just as in the reversed initial
- * packing operation. Those rows are thus the first ones in the work ar-
- * ray. After this operation has been performed, I will not need
- * those lindx arrays, and I will always be sending a buffer of size
- * jb x n, or n x jb, that is, U.
- *
- * At every step of the algorithm, it is necessary to update the list
- * llen, so that I can figure out how large the next messages I will be
- * sending/receiving are. It is obvious when I am sending U. It is not
- * otherwise.
- *
- * We choose icurrow to be the source of the bi-directional exchange.
- * This allows the processes in the non-power 2 part to receive U at the
- * first exchange, and then broadcast internally this U so that those
- * processes can grab their piece of A.
- *
- * Set-up performed right below:
- * - icurrow packed directly into U, so it has nothing in workspace:
- * its llen entry and ipA are zeroed;
- * - ipW, the index of the next free column of W, starts at ipA;
- * - size_ = nprow - ip2 is assigned inside the expression computing
- * Np2, which is nonzero exactly when size_ is nonzero;
- * - mydist is my position relative to icurrow as returned by MModSub
- * (presumably ( myrow - icurrow ) modulo nprow - confirm against the
- * macro definition).
- */
- if( myrow == icurrow ) { llen[myrow] = 0; ipA = 0; }
- ipW = ipA;
- Np2 = ( ( size_ = nprow - ip2 ) != 0 );
- mydist = (unsigned int)MModSub( myrow, icurrow, nprow );
- /*
- * bi-directional exchange: If nprow is not a power of 2, proc[i-ip2]
- * receives local data from proc[i] for all i in [ip2..nprow); icurrow
- * is the source, these last process indexes are relative to icurrow.
- *
- * partner is first computed as a relative index ( mydist ^ ip2 ), kept
- * only if it is smaller than nprow, and then turned into a process row
- * number with MModAdd.
- */
- if( ( Np2 != 0 ) && ( ( partner = (int)(mydist ^ ip2) ) < nprow ) )
- {
- partner = MModAdd( icurrow, partner, nprow );
- if( mydist == 0 ) /* I am the current row: I send U and recv W */
- {
- (void) HPL_sdrv( U, usize, Cmsgid, W, llen[partner] * ldW,
- Cmsgid, partner, comm );
- if( llen[partner] > 0 )
- HPL_dlaswp03T( llen[partner], n, U, LDU, W, W+1, ldW );
- }
- else if( mydist == ip2 )
- { /* I recv U for later Bcast, I send my W */
- (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize,
- Cmsgid, partner, comm );
- }
- else /* None of us is icurrow, we exchange our Ws */
- {
- if( ( mydist & ip2 ) != 0 )
- {
- (void) HPL_send( W, llen[myrow]*ldW, partner, Cmsgid, comm );
- }
- else
- {
- (void) HPL_recv( Mptr( W, 0, ipW, ldW ), llen[partner]*ldW,
- partner, Cmsgid, comm );
- if( llen[partner] > 0 ) ipW += llen[partner];
- }
- }
- }
- /*
- * Update llen
- *
- * For every relative index i in [1..size_), the entry of the process at
- * relative index i accumulates the entry of the process at relative
- * index i + ip2. Relative index 0 (icurrow) is skipped. The loop body
- * is never executed when size_ is less than 2.
- */
- for( i = 1; i < size_; i++ )
- {
- iprow = MModAdd( icurrow, i, nprow );
- partner = MModAdd( iprow, (int)(ip2), nprow );
- llen[ iprow ] += llen[ partner ];
- }
- /*
- * Probe for column panel - forward it when available
- */
- if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
- /*
- * power of 2 part of the processes collection: only processes [0..ip2)
- * are working; some of them (mydist >> (k+1) == 0) either send or re-
- * ceive U. At every step k, k is in [0 .. hdim), of the algorithm, a
- * process pair that exchanges U is such that (mydist >> (k+1) == 0).
- * Among those processes, the ones that are sending U are such that
- * mydist >> k == 0.
- *
- * ipow starts at 1 and is doubled at the end of every step, so that at
- * step k my partner is at relative index mydist ^ ipow.
- */
- if( mydist < ip2 )
- {
- k = 0;
- while( k < hdim )
- {
- partner = (int)(mydist ^ ipow);
- partner = MModAdd( icurrow, partner, nprow );
- /*
- * Exchange and combine the local results - If I receive U, then I must
- * copy from U the rows that belong to my piece of A, and then update U
- * by copying in it the rows I have accumulated in W. Otherwise, I re-
- * ceive W. In this latter case, if I have U, I shall update my copy of
- * U by copying in it the rows I have accumulated in W. If I did not
- * have U before, I simply need to update my pointer in W for later use.
- *
- * Three cases follow, in this order:
- * - I send U and receive the partner's W at column ipW of my W, then
- * merge what was received into U (HPL_dlaswp03T);
- * - I send my W and receive U, then HPL_dlaswp04T is called with U, A
- * and my W;
- * - I exchange W against W, the received part being appended at column
- * ipW of my workspace.
- */
- if( ( mydist >> (unsigned int)( k + 1 ) ) == 0 )
- {
- if( ( mydist >> (unsigned int)(k) ) == 0 )
- {
- (void) HPL_sdrv( U, usize, Cmsgid, Mptr( W, 0, ipW,
- ldW ), llen[partner]*ldW, Cmsgid,
- partner, comm );
- HPL_dlaswp03T( llen[partner], n, U, LDU, Mptr( W, 0, ipW,
- ldW ), Mptr( W, 1, ipW, ldW ), ldW );
- ipW += llen[partner];
- }
- else
- {
- (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize,
- Cmsgid, partner, comm );
- HPL_dlaswp04T( ipA, llen[myrow], n, U, LDU, A, lda, W,
- W+1, ldW, lindxA, lindxAU );
- }
- }
- else
- {
- (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, Mptr( W, 0,
- ipW, ldW ), llen[partner]*ldW, Cmsgid,
- partner, comm );
- ipW += llen[partner];
- }
- /*
- * Update llen - Go to next process pairs
- *
- * Every pair of relative indexes ( d, d ^ ipow ) with d in [0..ip2) is
- * visited once, from its smaller member: both entries of llen are set
- * to the sum of the two previous values. Note that partner is reused as
- * a scratch variable here (it is assigned inside the if condition) and
- * is recomputed at the top of the next step.
- */
- iprow = icurrow; ipdist = 0;
- do
- {
- if( (unsigned int)( partner = (int)(ipdist ^ ipow) ) > ipdist )
- {
- partner = MModAdd( icurrow, partner, nprow );
- llen[iprow] += llen[partner];
- llen[partner] = llen[iprow];
- }
- iprow = MModAdd( iprow, 1, nprow ); ipdist++;
- } while( ipdist < ip2 );
- ipow <<= 1; k++;
- /*
- * Probe for column panel - forward it when available
- */
- if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
- }
- }
- else
- {
- /*
- * non power of 2 part of the process collection: proc[ip2] broadcast U
- * to procs[ip2..nprow) (relatively to icurrow).
- *
- * Nothing is broadcast when size_ is 1, since proc[ip2] is then alone
- * in that part. Otherwise the while loop below leaves ip2_ equal to the
- * largest power of 2 not greater than size_ - 1 and mask equal to
- * 2 * ip2_ - 1. root is the process row at relative index ip2, and
- * mydis_ is my position relative to root. At each step of the do loop
- * the bit ip2_ is cleared from mask; a process whose mydis_ has no bit
- * in common with mask receives U if its bit ip2_ is set, and otherwise
- * sends U when the partner index is smaller than size_. ip2_ is then
- * halved until it reaches zero.
- */
- if( size_ > 1 )
- {
- k = size_ - 1;
- while( k > 1 ) { k >>= 1; ip2_ <<= 1; mask <<= 1; mask++; }
- root = MModAdd( icurrow, (int)(ip2), nprow );
- mydis_ = (unsigned int)MModSub( myrow, root, nprow );
- do
- {
- mask ^= ip2_;
- if( ( mydis_ & mask ) == 0 )
- {
- partner = (int)(mydis_ ^ ip2_);
- if( ( mydis_ & ip2_ ) != 0 )
- {
- (void) HPL_recv( U, usize, MModAdd( root, partner,
- nprow ), Cmsgid, comm );
- }
- else if( partner < size_ )
- {
- (void) HPL_send( U, usize, MModAdd( root, partner,
- nprow ), Cmsgid, comm );
- }
- }
- ip2_ >>= 1;
- /*
- * Probe for column panel - forward it when available
- */
- if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
- } while( ip2_ > 0 );
- }
- /*
- * Every process in [ip2..nprow) (relatively to icurrow) grabs its piece
- * of A.
- */
- HPL_dlaswp05T( ipA, n, A, lda, U, LDU, lindxA, lindxAU );
- }
- /*
- * If nprow is not a power of 2, proc[i-ip2] sends global result to
- * proc[i] for all i in [ip2..nprow);
- *
- * The pairing is the same as in the first exchange ( mydist ^ ip2 ):
- * the process whose mydist has the bit ip2 set receives U, the other
- * one sends it.
- */
- if( ( Np2 != 0 ) && ( ( partner = (int)(mydist ^ ip2) ) < nprow ) )
- {
- partner = MModAdd( icurrow, partner, nprow );
- if( ( mydist & ip2 ) != 0 )
- { (void) HPL_recv( U, usize, partner, Cmsgid, comm ); }
- else
- { (void) HPL_send( U, usize, partner, Cmsgid, comm ); }
- }
- if( vptr ) free( vptr );
- /*
- * Probe for column panel - forward it when available
- */
- if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
- #ifdef HPL_DETAILED_TIMING
- HPL_ptimer( HPL_TIMING_LASWP );
- #endif
- /*
- * End of HPL_pdlaswp00T
- */
- }
|