HPL_pdtest.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444
  1. /*
  2. * -- High Performance Computing Linpack Benchmark (HPL)
  3. * HPL - 2.0 - September 10, 2008
  4. * Antoine P. Petitet
  5. * University of Tennessee, Knoxville
  6. * Innovative Computing Laboratory
  7. * (C) Copyright 2000-2008 All Rights Reserved
  8. *
  9. * -- Copyright notice and Licensing terms:
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. *
  15. * 1. Redistributions of source code must retain the above copyright
  16. * notice, this list of conditions and the following disclaimer.
  17. *
  18. * 2. Redistributions in binary form must reproduce the above copyright
  19. * notice, this list of conditions, and the following disclaimer in the
  20. * documentation and/or other materials provided with the distribution.
  21. *
  22. * 3. All advertising materials mentioning features or use of this
  23. * software must display the following acknowledgement:
  24. * This product includes software developed at the University of
  25. * Tennessee, Knoxville, Innovative Computing Laboratory.
  26. *
  27. * 4. The name of the University, the name of the Laboratory, or the
  28. * names of its contributors may not be used to endorse or promote
  29. * products derived from this software without specific written
  30. * permission.
  31. *
  32. * -- Disclaimer:
  33. *
  34. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35. * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
  38. * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41. * DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45. * ---------------------------------------------------------------------
  46. */
  47. /*
  48. * Include files
  49. */
  50. #include "hpl.h"
  51. #ifdef STDC_HEADERS
  52. void HPL_pdtest
  53. (
  54. HPL_T_test * TEST,
  55. HPL_T_grid * GRID,
  56. HPL_T_palg * ALGO,
  57. const int N,
  58. const int NB
  59. )
  60. #else
  61. void HPL_pdtest
  62. ( TEST, GRID, ALGO, N, NB )
  63. HPL_T_test * TEST;
  64. HPL_T_grid * GRID;
  65. HPL_T_palg * ALGO;
  66. const int N;
  67. const int NB;
  68. #endif
  69. {
  70. /*
  71. * Purpose
  72. * =======
  73. *
  74. * HPL_pdtest performs one test given a set of parameters such as the
  75. * process grid, the problem size, the distribution blocking factor ...
  76. * This function generates the data, calls and times the linear system
  77. * solver, checks the accuracy of the obtained vector solution and
  78. * writes this information to the file pointed to by TEST->outfp.
  79. *
  80. * Arguments
  81. * =========
  82. *
  83. * TEST (global input) HPL_T_test *
  84. * On entry, TEST points to a testing data structure: outfp
  85. * specifies the output file where the results will be printed.
  86. * It is only defined and used by the process 0 of the grid.
  87. * thrsh specifies the threshhold value for the test ratio.
  88. * Concretely, a test is declared "PASSED" if and only if the
  89. * following inequality is satisfied:
  90. * ||Ax-b||_oo / ( epsil *
  91. * ( || x ||_oo * || A ||_oo + || b ||_oo ) *
  92. * N ) < thrsh.
  93. * epsil is the relative machine precision of the distributed
  94. * computer. Finally the test counters, kfail, kpass, kskip and
  95. * ktest are updated as follows: if the test passes, kpass is
  96. * incremented by one; if the test fails, kfail is incremented
  97. * by one; if the test is skipped, kskip is incremented by one.
  98. * ktest is left unchanged.
  99. *
  100. * GRID (local input) HPL_T_grid *
  101. * On entry, GRID points to the data structure containing the
  102. * process grid information.
  103. *
  104. * ALGO (global input) HPL_T_palg *
  105. * On entry, ALGO points to the data structure containing the
  106. * algorithmic parameters to be used for this test.
  107. *
  108. * N (global input) const int
  109. * On entry, N specifies the order of the coefficient matrix A.
  110. * N must be at least zero.
  111. *
  112. * NB (global input) const int
  113. * On entry, NB specifies the blocking factor used to partition
  114. * and distribute the matrix A. NB must be larger than one.
  115. *
  116. * ---------------------------------------------------------------------
  117. */
  118. /*
  119. * .. Local Variables ..
  120. */
  121. #ifdef HPL_DETAILED_TIMING
  122. double HPL_w[HPL_TIMING_N];
  123. #endif
  124. HPL_T_pmat mat;
  125. double wtime[1];
  126. int info[3];
  127. double Anorm1, AnormI, Gflops, Xnorm1, XnormI,
  128. BnormI, resid0, resid1;
  129. double * Bptr;
  130. void * vptr = NULL;
  131. static int first=1;
  132. int ii, ip2, mycol, myrow, npcol, nprow, nq;
  133. char ctop, cpfact, crfact;
  134. /* ..
  135. * .. Executable Statements ..
  136. */
  137. (void) HPL_grid_info( GRID, &nprow, &npcol, &myrow, &mycol );
  138. mat.n = N; mat.nb = NB; mat.info = 0;
  139. mat.mp = HPL_numroc( N, NB, NB, myrow, 0, nprow );
  140. nq = HPL_numroc( N, NB, NB, mycol, 0, npcol );
  141. mat.nq = nq + 1;
  142. /*
  143. * Allocate matrix, right-hand-side, and vector solution x. [ A | b ] is
  144. * N by N+1. One column is added in every process column for the solve.
  145. * The result however is stored in a 1 x N vector replicated in every
  146. * process row. In every process, A is lda * (nq+1), x is 1 * nq and the
  147. * workspace is mp.
  148. *
  149. * Ensure that lda is a multiple of ALIGN and not a power of 2
  150. */
  151. mat.ld = ( ( Mmax( 1, mat.mp ) - 1 ) / ALGO->align ) * ALGO->align;
  152. do
  153. {
  154. ii = ( mat.ld += ALGO->align ); ip2 = 1;
  155. while( ii > 1 ) { ii >>= 1; ip2 <<= 1; }
  156. }
  157. while( mat.ld == ip2 );
  158. /*
  159. * Allocate dynamic memory
  160. */
  161. vptr = (void*)malloc( ( (size_t)(ALGO->align) +
  162. (size_t)(mat.ld+1) * (size_t)(mat.nq) ) *
  163. sizeof(double) );
  164. info[0] = (vptr == NULL); info[1] = myrow; info[2] = mycol;
  165. (void) HPL_all_reduce( (void *)(info), 3, HPL_INT, HPL_max,
  166. GRID->all_comm );
  167. if( info[0] != 0 )
  168. {
  169. if( ( myrow == 0 ) && ( mycol == 0 ) )
  170. HPL_pwarn( TEST->outfp, __LINE__, "HPL_pdtest",
  171. "[%d,%d] %s", info[1], info[2],
  172. "Memory allocation failed for A, x and b. Skip." );
  173. (TEST->kskip)++;
  174. return;
  175. }
  176. /*
  177. * generate matrix and right-hand-side, [ A | b ] which is N by N+1.
  178. */
  179. mat.A = (double *)HPL_PTR( vptr,
  180. ((size_t)(ALGO->align) * sizeof(double) ) );
  181. mat.X = Mptr( mat.A, 0, mat.nq, mat.ld );
  182. HPL_pdmatgen( GRID, N, N+1, NB, mat.A, mat.ld, HPL_ISEED );
  183. #ifdef HPL_CALL_VSIPL
  184. mat.block = vsip_blockbind_d( (vsip_scalar_d *)(mat.A),
  185. (vsip_length)(mat.ld * mat.nq),
  186. VSIP_MEM_NONE );
  187. #endif
  188. /*
  189. * Solve linear system
  190. */
  191. HPL_ptimer_boot(); (void) HPL_barrier( GRID->all_comm );
  192. HPL_ptimer( 0 );
  193. HPL_pdgesv( GRID, ALGO, &mat );
  194. HPL_ptimer( 0 );
  195. #ifdef HPL_CALL_VSIPL
  196. (void) vsip_blockrelease_d( mat.block, VSIP_TRUE );
  197. vsip_blockdestroy_d( mat.block );
  198. #endif
  199. /*
  200. * Gather max of all CPU and WALL clock timings and print timing results
  201. */
  202. HPL_ptimer_combine( GRID->all_comm, HPL_AMAX_PTIME, HPL_WALL_PTIME,
  203. 1, 0, wtime );
  204. if( ( myrow == 0 ) && ( mycol == 0 ) )
  205. {
  206. if( first )
  207. {
  208. HPL_fprintf( TEST->outfp, "%s%s\n",
  209. "========================================",
  210. "========================================" );
  211. HPL_fprintf( TEST->outfp, "%s%s\n",
  212. "T/V N NB P Q",
  213. " Time Gflops" );
  214. HPL_fprintf( TEST->outfp, "%s%s\n",
  215. "----------------------------------------",
  216. "----------------------------------------" );
  217. if( TEST->thrsh <= HPL_rzero ) first = 0;
  218. }
  219. /*
  220. * 2/3 N^3 - 1/2 N^2 flops for LU factorization + 2 N^2 flops for solve.
  221. * Print WALL time
  222. */
  223. Gflops = ( ( (double)(N) / 1.0e+9 ) *
  224. ( (double)(N) / wtime[0] ) ) *
  225. ( ( 2.0 / 3.0 ) * (double)(N) + ( 3.0 / 2.0 ) );
  226. // FILE *perf_file;
  227. char name[50] = "/shared/DEMOS/RCCE/LINPACK/perf.";
  228. // char name[50] = "perf.";
  229. char postfix[50];
  230. sprintf(postfix, "%d", RCCE_num_ues());
  231. strcat(name, postfix);
  232. printf("Core %d opens file %s and writes %d\n", RCCE_ue(), name, (int)(1000*Gflops));
  233. // perf_file = fopen(name,"w");
  234. // fprintf(perf_file, "%d", (int)(1000*Gflops));
  235. // fclose(perf_file);
  236. cpfact = ( ( (HPL_T_FACT)(ALGO->pfact) ==
  237. (HPL_T_FACT)(HPL_LEFT_LOOKING) ) ? (char)('L') :
  238. ( ( (HPL_T_FACT)(ALGO->pfact) == (HPL_T_FACT)(HPL_CROUT) ) ?
  239. (char)('C') : (char)('R') ) );
  240. crfact = ( ( (HPL_T_FACT)(ALGO->rfact) ==
  241. (HPL_T_FACT)(HPL_LEFT_LOOKING) ) ? (char)('L') :
  242. ( ( (HPL_T_FACT)(ALGO->rfact) == (HPL_T_FACT)(HPL_CROUT) ) ?
  243. (char)('C') : (char)('R') ) );
  244. if( ALGO->btopo == HPL_1RING ) ctop = '0';
  245. else if( ALGO->btopo == HPL_1RING_M ) ctop = '1';
  246. else if( ALGO->btopo == HPL_2RING ) ctop = '2';
  247. else if( ALGO->btopo == HPL_2RING_M ) ctop = '3';
  248. else if( ALGO->btopo == HPL_BLONG ) ctop = '4';
  249. else /* if( ALGO->btopo == HPL_BLONG_M ) */ ctop = '5';
  250. if( wtime[0] > HPL_rzero )
  251. HPL_fprintf( TEST->outfp,
  252. "W%c%1d%c%c%1d%c%1d%12d %5d %5d %5d %18.4f %18.3e\n",
  253. ( GRID->order == HPL_ROW_MAJOR ? 'R' : 'C' ),
  254. ALGO->depth, ctop, crfact, ALGO->nbdiv, cpfact, ALGO->nbmin,
  255. N, NB, nprow, npcol, wtime[0], Gflops );
  256. }
  257. #ifdef HPL_DETAILED_TIMING
  258. HPL_ptimer_combine( GRID->all_comm, HPL_AMAX_PTIME, HPL_WALL_PTIME,
  259. HPL_TIMING_N, HPL_TIMING_BEG, HPL_w );
  260. if( ( myrow == 0 ) && ( mycol == 0 ) )
  261. {
  262. HPL_fprintf( TEST->outfp, "%s%s\n",
  263. "--VVV--VVV--VVV--VVV--VVV--VVV--VVV--V",
  264. "VV--VVV--VVV--VVV--VVV--VVV--VVV--VVV-" );
  265. /*
  266. * Recursive panel factorization
  267. */
  268. if( HPL_w[HPL_TIMING_RPFACT-HPL_TIMING_BEG] > HPL_rzero )
  269. HPL_fprintf( TEST->outfp,
  270. "Max aggregated wall time rfact . . . : %18.2f\n",
  271. HPL_w[HPL_TIMING_RPFACT-HPL_TIMING_BEG] );
  272. /*
  273. * Panel factorization
  274. */
  275. if( HPL_w[HPL_TIMING_PFACT-HPL_TIMING_BEG] > HPL_rzero )
  276. HPL_fprintf( TEST->outfp,
  277. "+ Max aggregated wall time pfact . . : %18.2f\n",
  278. HPL_w[HPL_TIMING_PFACT-HPL_TIMING_BEG] );
  279. /*
  280. * Panel factorization (swap)
  281. */
  282. if( HPL_w[HPL_TIMING_MXSWP-HPL_TIMING_BEG] > HPL_rzero )
  283. HPL_fprintf( TEST->outfp,
  284. "+ Max aggregated wall time mxswp . . : %18.2f\n",
  285. HPL_w[HPL_TIMING_MXSWP-HPL_TIMING_BEG] );
  286. /*
  287. * Update
  288. */
  289. if( HPL_w[HPL_TIMING_UPDATE-HPL_TIMING_BEG] > HPL_rzero )
  290. HPL_fprintf( TEST->outfp,
  291. "Max aggregated wall time update . . : %18.2f\n",
  292. HPL_w[HPL_TIMING_UPDATE-HPL_TIMING_BEG] );
  293. /*
  294. * Update (swap)
  295. */
  296. if( HPL_w[HPL_TIMING_LASWP-HPL_TIMING_BEG] > HPL_rzero )
  297. HPL_fprintf( TEST->outfp,
  298. "+ Max aggregated wall time laswp . . : %18.2f\n",
  299. HPL_w[HPL_TIMING_LASWP-HPL_TIMING_BEG] );
  300. /*
  301. * Upper triangular system solve
  302. */
  303. if( HPL_w[HPL_TIMING_PTRSV-HPL_TIMING_BEG] > HPL_rzero )
  304. HPL_fprintf( TEST->outfp,
  305. "Max aggregated wall time up tr sv . : %18.2f\n",
  306. HPL_w[HPL_TIMING_PTRSV-HPL_TIMING_BEG] );
  307. if( TEST->thrsh <= HPL_rzero )
  308. HPL_fprintf( TEST->outfp, "%s%s\n",
  309. "========================================",
  310. "========================================" );
  311. }
  312. #endif
  313. /*
  314. * Quick return, if I am not interested in checking the computations
  315. */
  316. if( TEST->thrsh <= HPL_rzero )
  317. { (TEST->kpass)++; if( vptr ) free( vptr ); return; }
  318. /*
  319. * Check info returned by solve
  320. */
  321. if( mat.info != 0 )
  322. {
  323. if( ( myrow == 0 ) && ( mycol == 0 ) )
  324. HPL_pwarn( TEST->outfp, __LINE__, "HPL_pdtest", "%s %d, %s",
  325. "Error code returned by solve is", mat.info, "skip" );
  326. (TEST->kskip)++;
  327. if( vptr ) free( vptr ); return;
  328. }
  329. /*
  330. * Check computation, re-generate [ A | b ], compute norm 1 and inf of A and x,
  331. * and norm inf of b - A x. Display residual checks.
  332. */
  333. HPL_pdmatgen( GRID, N, N+1, NB, mat.A, mat.ld, HPL_ISEED );
  334. Anorm1 = HPL_pdlange( GRID, HPL_NORM_1, N, N, NB, mat.A, mat.ld );
  335. AnormI = HPL_pdlange( GRID, HPL_NORM_I, N, N, NB, mat.A, mat.ld );
  336. /*
  337. * Because x is distributed in process rows, switch the norms
  338. */
  339. XnormI = HPL_pdlange( GRID, HPL_NORM_1, 1, N, NB, mat.X, 1 );
  340. Xnorm1 = HPL_pdlange( GRID, HPL_NORM_I, 1, N, NB, mat.X, 1 );
  341. /*
  342. * If I am in the col that owns b, (1) compute local BnormI, (2) all_reduce to
  343. * find the max (in the col). Then (3) broadcast along the rows so that every
  344. * process has BnormI. Note that since we use a uniform distribution in [-0.5,0.5]
  345. * for the entries of B, it is very likely that BnormI (<=,~) 0.5.
  346. */
  347. Bptr = Mptr( mat.A, 0, nq, mat.ld );
  348. if( mycol == HPL_indxg2p( N, NB, NB, 0, npcol ) ){
  349. if( mat.mp > 0 )
  350. {
  351. BnormI = Bptr[HPL_idamax( mat.mp, Bptr, 1 )]; BnormI = Mabs( BnormI );
  352. }
  353. else
  354. {
  355. BnormI = HPL_rzero;
  356. }
  357. (void) HPL_all_reduce( (void *)(&BnormI), 1, HPL_DOUBLE, HPL_max,
  358. GRID->col_comm );
  359. }
  360. (void) HPL_broadcast( (void *)(&BnormI), 1, HPL_DOUBLE,
  361. HPL_indxg2p( N, NB, NB, 0, npcol ),
  362. GRID->row_comm );
  363. /*
  364. * If I own b, compute ( b - A x ) and ( - A x ) otherwise
  365. */
  366. if( mycol == HPL_indxg2p( N, NB, NB, 0, npcol ) )
  367. {
  368. HPL_dgemv( HplColumnMajor, HplNoTrans, mat.mp, nq, -HPL_rone,
  369. mat.A, mat.ld, mat.X, 1, HPL_rone, Bptr, 1 );
  370. }
  371. else if( nq > 0 )
  372. {
  373. HPL_dgemv( HplColumnMajor, HplNoTrans, mat.mp, nq, -HPL_rone,
  374. mat.A, mat.ld, mat.X, 1, HPL_rzero, Bptr, 1 );
  375. }
  376. else { for( ii = 0; ii < mat.mp; ii++ ) Bptr[ii] = HPL_rzero; }
  377. /*
  378. * Reduce the distributed residual in process column 0
  379. */
  380. if( mat.mp > 0 )
  381. (void) HPL_reduce( Bptr, mat.mp, HPL_DOUBLE, HPL_sum, 0,
  382. GRID->row_comm );
  383. /*
  384. * Compute || b - A x ||_oo
  385. */
  386. resid0 = HPL_pdlange( GRID, HPL_NORM_I, N, 1, NB, Bptr, mat.ld );
  387. /*
  388. * Computes and displays norms, residuals ...
  389. */
  390. if( N <= 0 )
  391. {
  392. resid1 = HPL_rzero;
  393. }
  394. else
  395. {
  396. resid1 = resid0 / ( TEST->epsil * ( AnormI * XnormI + BnormI ) * (double)(N) );
  397. }
  398. if( resid1 < TEST->thrsh ) (TEST->kpass)++;
  399. else (TEST->kfail)++;
  400. if( ( myrow == 0 ) && ( mycol == 0 ) )
  401. {
  402. HPL_fprintf( TEST->outfp, "%s%s\n",
  403. "----------------------------------------",
  404. "----------------------------------------" );
  405. HPL_fprintf( TEST->outfp, "%s%16.7f%s%s\n",
  406. "||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= ", resid1,
  407. " ...... ", ( resid1 < TEST->thrsh ? "PASSED" : "FAILED" ) );
  408. if( resid1 >= TEST->thrsh )
  409. {
  410. HPL_fprintf( TEST->outfp, "%s%18.6f\n",
  411. "||Ax-b||_oo . . . . . . . . . . . . . . . . . = ", resid0 );
  412. HPL_fprintf( TEST->outfp, "%s%18.6f\n",
  413. "||A||_oo . . . . . . . . . . . . . . . . . . . = ", AnormI );
  414. HPL_fprintf( TEST->outfp, "%s%18.6f\n",
  415. "||A||_1 . . . . . . . . . . . . . . . . . . . = ", Anorm1 );
  416. HPL_fprintf( TEST->outfp, "%s%18.6f\n",
  417. "||x||_oo . . . . . . . . . . . . . . . . . . . = ", XnormI );
  418. HPL_fprintf( TEST->outfp, "%s%18.6f\n",
  419. "||x||_1 . . . . . . . . . . . . . . . . . . . = ", Xnorm1 );
  420. HPL_fprintf( TEST->outfp, "%s%18.6f\n",
  421. "||b||_oo . . . . . . . . . . . . . . . . . . . = ", BnormI );
  422. }
  423. }
  424. if( vptr ) free( vptr );
  425. /*
  426. * End of HPL_pdtest
  427. */
  428. }