HPL_pdlaswp00T.c

/*
 * -- High Performance Computing Linpack Benchmark (HPL)
 *    HPL - 2.0 - September 10, 2008
 *    Antoine P. Petitet
 *    University of Tennessee, Knoxville
 *    Innovative Computing Laboratory
 *    (C) Copyright 2000-2008 All Rights Reserved
 *
 * -- Copyright notice and Licensing terms:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions, and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 * software must display the following acknowledgement:
 * This product includes software developed at the University of
 * Tennessee, Knoxville, Innovative Computing Laboratory.
 *
 * 4. The name of the University, the name of the Laboratory, or the
 * names of its contributors may not be used to endorse or promote
 * products derived from this software without specific written
 * permission.
 *
 * -- Disclaimer:
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ---------------------------------------------------------------------
 */
/*
 * Include files
 */
#include "hpl.h"

#ifdef STDC_HEADERS
void HPL_pdlaswp00T
(
   HPL_T_panel *                    PBCST,
   int *                            IFLAG,
   HPL_T_panel *                    PANEL,
   const int                        NN
)
#else
void HPL_pdlaswp00T
( PBCST, IFLAG, PANEL, NN )
   HPL_T_panel *                    PBCST;
   int *                            IFLAG;
   HPL_T_panel *                    PANEL;
   const int                        NN;
#endif
{
/*
 * Purpose
 * =======
 *
 * HPL_pdlaswp00T applies the NB row interchanges to NN columns of the
 * trailing submatrix and broadcasts a column panel.
 *
 * Bi-directional exchange is used to perform the swap :: broadcast of
 * the row panel U at once, resulting in a lower number of messages than
 * usual as well as a lower communication volume. With P process rows
 * and assuming bi-directional links, the running time of this function
 * can be approximated by:
 *
 *    log_2(P) * (lat + NB * LocQ(N) / bdwth)
 *
 * where NB is the number of rows of the row panel U, N is the global
 * number of columns being updated, and lat and bdwth are the latency
 * and bandwidth of the network for double precision real words.
 * Mono-directional links will double this communication cost.
 *
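 * For illustration only (these numbers are hypothetical, not taken
 * from this file): with P = 4 process rows, NB = 64, LocQ(N) = 10000
 * local columns, lat = 1e-5 s, and bdwth = 1e9 double precision words
 * per second, the model predicts roughly
 *
 *    log_2(4) * (1e-5 + 64 * 10000 / 1e9) = 2 * 6.5e-4 = 1.3e-3 s.
 *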
 * Arguments
 * =========
 *
 * PBCST   (local input/output)          HPL_T_panel *
 *         On entry, PBCST points to the data structure containing the
 *         panel (to be broadcast) information.
 *
 * IFLAG   (local input/output)          int *
 *         On entry, IFLAG indicates whether or not the broadcast has
 *         already been completed. If not, probing will occur, and the
 *         outcome will be contained in IFLAG on exit.
 *
 * PANEL   (local input/output)          HPL_T_panel *
 *         On entry, PANEL points to the data structure containing the
 *         panel (to be broadcast and swapped) information.
 *
 * NN      (local input)                 const int
 *         On entry, NN specifies the local number of columns of the
 *         trailing submatrix to be swapped and broadcast starting at
 *         the current position. NN must be at least zero.
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   MPI_Comm                   comm;
   HPL_T_grid                 * grid;
   double                     * A, * U, * W;
   void                       * vptr = NULL;
   int                        * ipID, * lindxA, * lindxAU, * llen,
                              * llen_sv;
   unsigned int               ip2, ip2_=1, ipdist, ipow=1, mask=1,
                              mydist, mydis_;
   int                        Cmsgid=MSGID_BEGIN_PFACT, Np2, align,
                              hdim, i, icurrow, *iflag, ipA, ipW, *ipl,
                              iprow, jb, k, lda, ldW, myrow, n, nprow,
                              partner, root, size_, usize;
#define LDU                   n
/* ..
 * .. Executable Statements ..
 */
   n = Mmin( NN, PANEL->n ); jb = PANEL->jb;
/*
 * Quick return if there is nothing to do
 */
   if( ( n <= 0 ) || ( jb <= 0 ) ) return;
#ifdef HPL_DETAILED_TIMING
   HPL_ptimer( HPL_TIMING_LASWP );
#endif
/*
 * Retrieve parameters from the PANEL data structure
 */
   grid = PANEL->grid; nprow = grid->nprow; myrow = grid->myrow;
   comm = grid->col_comm; ip2 = (unsigned int)grid->row_ip2;
   hdim = grid->row_hdim; align = PANEL->algo->align;
   A = PANEL->A; U = PANEL->U; iflag = PANEL->IWORK;
   lda = PANEL->lda; icurrow = PANEL->prow; usize = jb * n;
   ldW = n + 1;
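/*
 * Note: ldW is n + 1 rather than n because each column stored in the
 * workspace W carries one extra leading entry, the row offset in U
 * where that column ultimately belongs (see the packing comment
 * further below).
 */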
/*
 * Allocate space for temporary W (ldW * jb)
 */
   vptr = (void*)malloc( ( (size_t)(align) +
                           ((size_t)(jb) * (size_t)(ldW))) *
                         sizeof(double) );
   if( vptr == NULL )
   { HPL_pabort( __LINE__, "HPL_pdlaswp00T", "Memory allocation failed" ); }
   W = (double *)HPL_PTR( vptr, ((size_t)(align) * sizeof(double) ) );
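/*
 * (HPL_PTR rounds the address vptr up to the next multiple of
 * align * sizeof(double) bytes; the align extra doubles requested in
 * the malloc above guarantee that the aligned pointer W still lies
 * within the allocated region.)
 */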
/*
 * Construct ipID and its local counterparts lindxA, lindxAU - llen is
 * the number of rows/columns that I have in workspace and that I should
 * send. Compute lindx_, ipA, llen if it has not already been done for
 * this panel.
 */
   k = (int)((unsigned int)(jb) << 1); ipl = iflag + 1; ipID = ipl + 1;
   lindxA = ipID + ((unsigned int)(k) << 1); lindxAU = lindxA + k;
   llen = lindxAU + k; llen_sv = llen + nprow;
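/*
 * Resulting layout of PANEL->IWORK, derived from the pointer
 * arithmetic above (with k = 2 * jb):
 *
 *    iflag   : 1 entry
 *    ipl     : 1 entry
 *    ipID    : 2*k entries (= 4*jb)
 *    lindxA  : k entries   (= 2*jb)
 *    lindxAU : k entries   (= 2*jb)
 *    llen    : nprow entries
 *    llen_sv : nprow entries
 */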
   if( *iflag == -1 ) /* no index arrays have been computed so far */
   {
      HPL_pipid( PANEL, ipl, ipID );
      HPL_plindx0( PANEL, *ipl, ipID, lindxA, lindxAU, llen_sv );
      *iflag = 0;
   }
   else if( *iflag == 1 ) /* HPL_pdlaswp01T called before: reuse ipID */
   {
      HPL_plindx0( PANEL, *ipl, ipID, lindxA, lindxAU, llen_sv );
      *iflag = 0;
   }
/*
 * Copy llen_sv into llen - Reset ipA to its correct value
 */
   ipA = llen_sv[myrow];
   for( i = 0; i < nprow; i++ ) { llen[i] = llen_sv[i]; }
/*
 * For i in [0..2*jb), lindxA[i] is the offset in A of a row that
 * ultimately goes to U( lindxAU[i], : ) or U( :, lindxAU[i] ). In
 * icurrow, we directly pack into U, otherwise we pack into workspace.
 * The first entry of each column packed in workspace is in fact the
 * row or column offset in U where it should go to.
 */
   if( myrow == icurrow )
   {
      HPL_dlaswp01T( ipA, n, A, lda, U, LDU, lindxA, lindxAU );
   }
   else
   {
      HPL_dlaswp02N( ipA, n, A, lda, W, W+1, ldW, lindxA, lindxAU );
   }
/*
 * Probe for column panel - forward it when available
 */
   if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
/*
 * Algorithm for bi-directional data exchange:
 *
 * As long as I have not talked to a process that already had the data
 * from icurrow, I will be sending the workspace, otherwise I will be
 * sending U. Note that the columns in workspace contain the local index
 * in U they should go to.
 *
 * If I am receiving from a process that has the data from icurrow, I
 * will be receiving in U, copy the data of U that stays into A, and
 * then the columns I have in workspace into U; otherwise I will be
 * receiving in the remaining workspace. If I am one of those processes
 * that already has the data from icurrow, I will be immediately copying
 * the data I have in my workspace into U.
 *
 * When I receive U, some of U should be copied into my piece of A
 * before I can copy the rows I have in my workspace into U. This
 * information is kept in the lists lindx_: the row lindxAU[i] should be
 * copied in the row lindxA[i] of my piece of A, just as in the reversed
 * initial packing operation. Those rows are thus the first ones in the
 * work array. After this operation has been performed, I will not need
 * those lindx arrays, and I will always be sending a buffer of size
 * jb x n, or n x jb, that is, U.
 *
 * At every step of the algorithm, it is necessary to update the list
 * llen, so that I can figure out how large the next messages I will be
 * sending/receiving are. It is obvious when I am sending U. It is not
 * otherwise.
 *
 * We choose icurrow to be the source of the bi-directional exchange.
 * This allows the processes in the non-power-of-two part to receive U
 * at the first exchange, and then broadcast this U internally so that
 * those processes can grab their piece of A.
 */
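/*
 * Worked example (hypothetical values, not from this routine): with
 * nprow = 6 and icurrow = 0, ip2 = 4 and size_ = 2, so Np2 != 0.
 * Distances 4 and 5 from icurrow pair with distances 0 and 1 via
 * mydist ^ ip2 in the exchange just below; the power-of-two subset
 * [0..4) then performs the log_2(ip2)-step bi-directional exchange.
 */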
   if( myrow == icurrow ) { llen[myrow] = 0; ipA = 0; }
   ipW = ipA;
   Np2 = ( ( size_ = nprow - ip2 ) != 0 );
   mydist = (unsigned int)MModSub( myrow, icurrow, nprow );
/*
 * Bi-directional exchange: If nprow is not a power of 2, proc[i-ip2]
 * receives local data from proc[i] for all i in [ip2..nprow); icurrow
 * is the source, and the latter process indexes are relative to
 * icurrow.
 */
   if( ( Np2 != 0 ) && ( ( partner = (int)(mydist ^ ip2) ) < nprow ) )
   {
      partner = MModAdd( icurrow, partner, nprow );
      if( mydist == 0 ) /* I am the current row: I send U and recv W */
      {
         (void) HPL_sdrv( U, usize, Cmsgid, W, llen[partner] * ldW,
                          Cmsgid, partner, comm );
         if( llen[partner] > 0 )
            HPL_dlaswp03T( llen[partner], n, U, LDU, W, W+1, ldW );
      }
      else if( mydist == ip2 )
      { /* I recv U for later Bcast, I send my W */
         (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize,
                          Cmsgid, partner, comm );
      }
      else /* Neither of us is icurrow, we exchange our Ws */
      {
         if( ( mydist & ip2 ) != 0 )
         {
            (void) HPL_send( W, llen[myrow]*ldW, partner, Cmsgid, comm );
         }
         else
         {
            (void) HPL_recv( Mptr( W, 0, ipW, ldW ), llen[partner]*ldW,
                             partner, Cmsgid, comm );
            if( llen[partner] > 0 ) ipW += llen[partner];
         }
      }
   }
/*
 * Update llen
 */
   for( i = 1; i < size_; i++ )
   {
      iprow = MModAdd( icurrow, i, nprow );
      partner = MModAdd( iprow, (int)(ip2), nprow );
      llen[ iprow ] += llen[ partner ];
   }
/*
 * Probe for column panel - forward it when available
 */
   if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
/*
 * Power-of-two part of the process collection: only processes [0..ip2)
 * are working; some of them (those with mydist >> (k+1) == 0) either
 * send or receive U. At every step k in [0..hdim) of the algorithm, a
 * process pair that exchanges U is such that mydist >> (k+1) == 0.
 * Among those processes, the ones that are sending U are such that
 * mydist >> k == 0.
 */
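/*
 * Example (hypothetical, ip2 = 4 so hdim = 2): at step k = 0,
 * mydist >> 1 == 0 selects distances {0,1}, and distance 0 (the one
 * with mydist >> 0 == 0) sends U to distance 1; distances 2 and 3
 * exchange their Ws. At step k = 1, mydist >> 2 == 0 selects
 * {0,1,2,3}, and distances 0 and 1 send U to partners 2 and 3
 * (mydist ^ 2). After hdim steps, every process in [0..ip2) owns U.
 */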
   if( mydist < ip2 )
   {
      k = 0;
      while( k < hdim )
      {
         partner = (int)(mydist ^ ipow);
         partner = MModAdd( icurrow, partner, nprow );
/*
 * Exchange and combine the local results - If I receive U, then I must
 * copy from U the rows that belong to my piece of A, and then update U
 * by copying in it the rows I have accumulated in W. Otherwise, I
 * receive W. In this latter case, if I have U, I update my copy of U by
 * copying in it the rows I have accumulated in W. If I did not have U
 * before, I simply need to update my pointer in W for later use.
 */
         if( ( mydist >> (unsigned int)( k + 1 ) ) == 0 )
         {
            if( ( mydist >> (unsigned int)(k) ) == 0 )
            {
               (void) HPL_sdrv( U, usize, Cmsgid, Mptr( W, 0, ipW,
                                ldW ), llen[partner]*ldW, Cmsgid,
                                partner, comm );
               HPL_dlaswp03T( llen[partner], n, U, LDU, Mptr( W, 0, ipW,
                              ldW ), Mptr( W, 1, ipW, ldW ), ldW );
               ipW += llen[partner];
            }
            else
            {
               (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize,
                                Cmsgid, partner, comm );
               HPL_dlaswp04T( ipA, llen[myrow], n, U, LDU, A, lda, W,
                              W+1, ldW, lindxA, lindxAU );
            }
         }
         else
         {
            (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, Mptr( W, 0,
                             ipW, ldW ), llen[partner]*ldW, Cmsgid,
                             partner, comm );
            ipW += llen[partner];
         }
/*
 * Update llen - Go to next process pairs
 */
         iprow = icurrow; ipdist = 0;
         do
         {
            if( (unsigned int)( partner = (int)(ipdist ^ ipow) ) > ipdist )
            {
               partner = MModAdd( icurrow, partner, nprow );
               llen[iprow] += llen[partner];
               llen[partner] = llen[iprow];
            }
            iprow = MModAdd( iprow, 1, nprow ); ipdist++;
         } while( ipdist < ip2 );
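/*
 * Illustration (first pass, ipow = 1, ip2 = 4): distance 0 pairs with
 * 1 and distance 2 pairs with 3; each pair's two counts are summed and
 * both entries set to that sum, so every process knows the sizes of
 * the next messages.
 */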
         ipow <<= 1; k++;
/*
 * Probe for column panel - forward it when available
 */
         if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
      }
   }
   else
   {
/*
 * Non-power-of-two part of the process collection: proc[ip2] broadcasts
 * U to procs[ip2..nprow) (relative to icurrow).
 */
      if( size_ > 1 )
      {
         k = size_ - 1;
         while( k > 1 ) { k >>= 1; ip2_ <<= 1; mask <<= 1; mask++; }
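/*
 * For instance (hypothetical size_ = 5): k goes 4 -> 2 -> 1, leaving
 * ip2_ = 4 and mask = 7; that is, ip2_ becomes the largest power of
 * two not exceeding size_ - 1 and mask has the matching low bits set.
 * The loop below then peels off one bit per round of the tree
 * broadcast of U.
 */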
         root = MModAdd( icurrow, (int)(ip2), nprow );
         mydis_ = (unsigned int)MModSub( myrow, root, nprow );
         do
         {
            mask ^= ip2_;
            if( ( mydis_ & mask ) == 0 )
            {
               partner = (int)(mydis_ ^ ip2_);
               if( ( mydis_ & ip2_ ) != 0 )
               {
                  (void) HPL_recv( U, usize, MModAdd( root, partner,
                                   nprow ), Cmsgid, comm );
               }
               else if( partner < size_ )
               {
                  (void) HPL_send( U, usize, MModAdd( root, partner,
                                   nprow ), Cmsgid, comm );
               }
            }
            ip2_ >>= 1;
/*
 * Probe for column panel - forward it when available
 */
            if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
         } while( ip2_ > 0 );
      }
/*
 * Every process in [ip2..nprow) (relative to icurrow) grabs its piece
 * of A.
 */
      HPL_dlaswp05T( ipA, n, A, lda, U, LDU, lindxA, lindxAU );
   }
/*
 * If nprow is not a power of 2, proc[i-ip2] sends the global result to
 * proc[i] for all i in [ip2..nprow).
 */
   if( ( Np2 != 0 ) && ( ( partner = (int)(mydist ^ ip2) ) < nprow ) )
   {
      partner = MModAdd( icurrow, partner, nprow );
      if( ( mydist & ip2 ) != 0 )
      { (void) HPL_recv( U, usize, partner, Cmsgid, comm ); }
      else
      { (void) HPL_send( U, usize, partner, Cmsgid, comm ); }
   }
   if( vptr ) free( vptr );
/*
 * Probe for column panel - forward it when available
 */
   if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
#ifdef HPL_DETAILED_TIMING
   HPL_ptimer( HPL_TIMING_LASWP );
#endif
/*
 * End of HPL_pdlaswp00T
 */
}