timing.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748
  1. /**
  2. *
  3. * @file time_main.c
  4. *
  5. * PLASMA auxiliary routines
  6. * PLASMA is a software package provided by Univ. of Tennessee,
  7. * Univ. of California Berkeley and Univ. of Colorado Denver
  8. *
  9. * @version 2.3.1
  10. * @author ???
  11. * @author Mathieu Faverge
  12. * @date 2010-11-15
  13. *
  14. **/
  15. /* Define these so that the Microsoft VC compiler stops complaining
  16. about scanf and friends */
  17. #define _CRT_SECURE_NO_DEPRECATE
  18. #define _CRT_SECURE_NO_WARNINGS
  19. #include <math.h>
  20. #include <stdio.h>
  21. #include <stdlib.h>
  22. #include <string.h>
  23. #ifdef PLASMA_EZTRACE
  24. #include <eztrace.h>
  25. #endif
  26. #if defined( _WIN32 ) || defined( _WIN64 )
  27. #include <windows.h>
  28. #include <time.h>
  29. #include <sys/timeb.h>
  30. #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
  31. #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
  32. #else
  33. #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
  34. #endif
  35. struct timezone
  36. {
  37. int tz_minuteswest; /* minutes W of Greenwich */
  38. int tz_dsttime; /* type of dst correction */
  39. };
  40. int gettimeofday(struct timeval* tv, struct timezone* tz)
  41. {
  42. FILETIME ft;
  43. unsigned __int64 tmpres = 0;
  44. static int tzflag;
  45. if (NULL != tv)
  46. {
  47. GetSystemTimeAsFileTime(&ft);
  48. tmpres |= ft.dwHighDateTime;
  49. tmpres <<= 32;
  50. tmpres |= ft.dwLowDateTime;
  51. /*converting file time to unix epoch*/
  52. tmpres /= 10; /*convert into microseconds*/
  53. tmpres -= DELTA_EPOCH_IN_MICROSECS;
  54. tv->tv_sec = (long)(tmpres / 1000000UL);
  55. tv->tv_usec = (long)(tmpres % 1000000UL);
  56. }
  57. if (NULL != tz)
  58. {
  59. if (!tzflag)
  60. {
  61. _tzset();
  62. tzflag++;
  63. }
  64. tz->tz_minuteswest = _timezone / 60;
  65. tz->tz_dsttime = _daylight;
  66. }
  67. return 0;
  68. }
  69. #else /* Non-Windows */
  70. #include <unistd.h>
  71. #include <sys/time.h>
  72. #include <sys/resource.h>
  73. #endif
  74. #include <cblas.h>
  75. #include <lapacke.h>
  76. #include <plasma.h>
  77. #include <core_blas.h>
  78. #include <magma_morse.h>
  79. #include <sched_ctx_hypervisor.h>
  80. #include "timing.h"
  81. #include "auxiliary.h"
  82. #include <pthread.h>
  /* Forward declaration of the timed routine called by Test(); its definition
     is not in this part of the file. */
  83. static int RunTest(int *iparam, _PREC *dparam, double *t_);
  /* Wall-clock time in seconds; defined just below. */
  84. double cWtime(void);
  85. int ISEED[4] = {0,0,0,1}; /* initial seed for zlarnv() */
  86. /*
  87. * struct timeval {time_t tv_sec; suseconds_t tv_usec;};
  88. */
  /*
   * Current wall-clock time in seconds, as reported by gettimeofday(),
   * with the microsecond field folded in as the fractional part.
   */
  double cWtime(void)
  {
      struct timeval now;
      double seconds;
      double fraction;

      gettimeofday( &now, NULL );

      seconds  = now.tv_sec;
      fraction = 1e-6 * now.tv_usec;
      return seconds + fraction;
  }
  /* Per-iteration timings for the first and second problem. Test() allocates
     them (niter entries each), reads t1[iter]/t2[iter] after every RunTest()
     call and frees them. Presumably RunTest() writes them - confirm there. */
  95. double *t1, *t2;
  /* Result parameters for the first and second problem; Test() reads them when
     printing the peak and check columns. Presumably filled by RunTest(). */
  96. _PREC dparam1[TIMING_DNBPARAM];
  97. _PREC dparam2[TIMING_DNBPARAM];
  98. static int
  99. Test(int64_t n, int *iparam) {
  100. int i, j, iter, m;
  101. int thrdnbr, niter, nrhs;
  102. double *t;
  103. _PREC eps = _LAMCH( 'e' );
  104. _PREC dparam[TIMING_DNBPARAM];
  105. double flops, fmuls, fadds, fp_per_mul, fp_per_add;
  106. double sumgf, sumgf2, sumt, sd, gflops;
  107. double flops_2, fmuls_2, fadds_2;
  108. double sumgf_2, sumgf2_2, sumt_2, sd_2, gflops_2;
  109. char *s;
  110. char *env[] = {
  111. "OMP_NUM_THREADS",
  112. "MKL_NUM_THREADS",
  113. "GOTO_NUM_THREADS",
  114. "ACML_NUM_THREADS",
  115. "ATLAS_NUM_THREADS",
  116. "BLAS_NUM_THREADS", ""
  117. };
  118. int gnuplot = 0;
  119. thrdnbr = iparam[TIMING_THRDNBR];
  120. niter = iparam[TIMING_NITER];
  121. nrhs = iparam[TIMING_NRHS];
  122. if (n < 0 || thrdnbr < 0) {
  123. const char *bound_header = iparam[TIMING_BOUND] ? " thGflop/s" : "";
  124. const char *check_header = iparam[TIMING_CHECK] ? " ||Ax-b|| ||A|| ||x|| ||b|| eps ||Ax-b||/N/eps/(||A||||x||+||b||)" : "";
  125. const char *peak_header = iparam[TIMING_PEAK] ? " (\% of peak) peak" : "";
  126. printf( "# N NRHS threads seconds Gflop/s Deviation %s%s%s\n", bound_header, peak_header, check_header);
  127. if (gnuplot) {
  128. printf( "set title '%d_NUM_THREADS: ", thrdnbr );
  129. for (i = 0; env[i][0]; ++i) {
  130. s = getenv( env[i] );
  131. if (i) printf( " " ); /* separating space */
  132. for (j = 0; j < 5 && env[i][j] && env[i][j] != '_'; ++j)
  133. printf( "%c", env[i][j] );
  134. if (s)
  135. printf( "=%s", s );
  136. else
  137. printf( "->%s", "?" );
  138. }
  139. printf( "'\n" );
  140. printf( "%s\n%s\n%s\n%s\n%s%s%s\n",
  141. "set xlabel 'Matrix size'",
  142. "set ylabel 'Gflop/s'",
  143. "set key bottom",
  144. gnuplot > 1 ? "set terminal png giant\nset output 'timeplot.png'" : "",
  145. "plot '-' using 1:5 title '", _NAME, "' with linespoints" );
  146. }
  147. return 0;
  148. }
  149. printf( "%5d %4d %5d ", iparam[TIMING_N], iparam[TIMING_NRHS], iparam[TIMING_THRDNBR] );
  150. printf( "%5d %4d %5d ", iparam[TIMING_N2], iparam[TIMING_NRHS], iparam[TIMING_THRDNBR] );
  151. fflush( stdout );
  152. t = (double*)malloc(niter*sizeof(double));
  153. memset(t, 0, niter*sizeof(double));
  154. t1 = (double*)malloc(niter*sizeof(double));
  155. memset(t, 0, niter*sizeof(double));
  156. t2 = (double*)malloc(niter*sizeof(double));
  157. memset(t, 0, niter*sizeof(double));
  158. if (sizeof(_TYPE) == sizeof(_PREC)) {
  159. fp_per_mul = 1;
  160. fp_per_add = 1;
  161. } else {
  162. fp_per_mul = 6;
  163. fp_per_add = 2;
  164. }
  165. m = iparam[TIMING_M];
  166. n = iparam[TIMING_N];
  167. fadds = _FADDS;
  168. fmuls = _FMULS;
  169. flops = fmuls * fp_per_mul + fadds * fp_per_add;
  170. gflops = 0.0;
  171. m = iparam[TIMING_M2];
  172. n = iparam[TIMING_N2];
  173. fadds_2 = _FADDS;
  174. fmuls_2 = _FMULS;
  175. flops_2 = fmuls_2 * fp_per_mul + fadds_2 * fp_per_add;
  176. gflops_2 = 0.0;
  177. if ( iparam[TIMING_WARMUP] ) {
  178. RunTest( iparam, dparam, &(t[0]));
  179. }
  180. sumgf = 0.0;
  181. double sumgf_upper = 0.0;
  182. sumgf2 = 0.0;
  183. sumt = 0.0;
  184. sumgf_2 = 0.0;
  185. double sumgf_upper_2 = 0.0;
  186. sumgf2_2 = 0.0;
  187. sumt_2 = 0.0;
  188. for (iter = 0; iter < niter; iter++)
  189. {
  190. #ifdef PLASMA_EZTRACE
  191. if( iter == 0 ) {
  192. eztrace_start();
  193. RunTest( iparam, dparam, &(t[iter]));
  194. eztrace_stop();
  195. }
  196. else
  197. #endif
  198. RunTest( iparam, dparam, &(t[iter]));
  199. double tmin = 0.0;
  200. double integer_tmin = 0.0;
  201. double upper_gflops = 0.0;
  202. double tmin_2 = 0.0;
  203. double integer_tmin_2 = 0.0;
  204. double upper_gflops_2 = 0.0;
  205. #if 0
  206. if (iparam[TIMING_BOUND])
  207. {
  208. if (iparam[TIMING_BOUNDDEPS]) {
  209. FILE *out = fopen("bounddeps.pl", "w");
  210. starpu_bound_print_lp(out);
  211. fclose(out);
  212. out = fopen("bound.dot", "w");
  213. starpu_bound_print_dot(out);
  214. fclose(out);
  215. } else {
  216. #if 0
  217. FILE *out = fopen("bound.pl", "w");
  218. starpu_bound_print_lp(out);
  219. fclose(out);
  220. #endif
  221. starpu_bound_compute(&tmin, &integer_tmin, 0);
  222. upper_gflops = ((1e-6 * flops) / tmin);
  223. starpu_bound_compute(&tmin_2, &integer_tmin_2, 0);
  224. upper_gflops_2 = ((1e-6 * flops_2) / tmin_2);
  225. }
  226. }
  227. #endif
  228. printf("t1 = %lf t2 = %lf \n", t1[0], t2[0]);
  229. gflops = (1e-9 * flops) / t1[iter];
  230. sumt += t1[iter];
  231. sumgf_upper += upper_gflops;
  232. sumgf += gflops;
  233. sumgf2 += gflops*gflops;
  234. gflops_2 = (1e-9 * flops_2) / t2[iter];
  235. sumt_2 += t2[iter];
  236. sumgf_upper_2 += upper_gflops_2;
  237. sumgf_2 += gflops_2;
  238. sumgf2_2 += gflops_2*gflops_2;
  239. }
  240. gflops = sumgf / niter;
  241. sd = sqrt((sumgf2 - (sumgf*sumgf)/niter)/niter);
  242. gflops_2 = sumgf_2 / niter;
  243. sd_2 = sqrt((sumgf2_2 - (sumgf_2*sumgf_2)/niter)/niter);
  244. printf( "%9.3f %9.2f +-%7.2f ", sumt/niter, gflops, sd);
  245. if (iparam[TIMING_BOUND] && !iparam[TIMING_BOUNDDEPS])
  246. printf(" %9.2f", sumgf_upper/niter);
  247. if ( iparam[TIMING_PEAK] )
  248. {
  249. if (dparam1[TIMING_ESTIMATED_PEAK]<0.0f)
  250. printf(" n/a n/a ");
  251. else
  252. printf(" %2.2f\%% %9.2f ", 100.0f*(gflops/dparam1[TIMING_ESTIMATED_PEAK]), dparam1[TIMING_ESTIMATED_PEAK]);
  253. }
  254. if ( iparam[TIMING_CHECK] )
  255. printf( "%8.5e %8.5e %8.5e %8.5e %8.5e %8.5e",
  256. dparam1[TIMING_RES], dparam1[TIMING_ANORM], dparam1[TIMING_XNORM], dparam1[TIMING_BNORM], eps,
  257. dparam1[TIMING_RES] / n / eps / (dparam1[TIMING_ANORM] * dparam1[TIMING_XNORM] + dparam1[TIMING_BNORM] ));
  258. printf("\n");
  259. printf( "%9.3f %9.2f +-%7.2f ", sumt_2/niter, gflops_2, sd_2);
  260. if (iparam[TIMING_BOUND] && !iparam[TIMING_BOUNDDEPS])
  261. printf(" %9.2f", sumgf_upper_2/niter);
  262. if ( iparam[TIMING_PEAK] )
  263. {
  264. if (dparam2[TIMING_ESTIMATED_PEAK]<0.0f)
  265. printf(" n/a n/a ");
  266. else
  267. printf(" %2.2f\%% %9.2f ", 100.0f*(gflops_2/dparam2[TIMING_ESTIMATED_PEAK]), dparam2[TIMING_ESTIMATED_PEAK]);
  268. }
  269. if ( iparam[TIMING_CHECK] )
  270. printf( "%8.5e %8.5e %8.5e %8.5e %8.5e %8.5e",
  271. dparam2[TIMING_RES], dparam2[TIMING_ANORM], dparam2[TIMING_XNORM], dparam2[TIMING_BNORM], eps,
  272. dparam2[TIMING_RES] / n / eps / (dparam2[TIMING_ANORM] * dparam2[TIMING_XNORM] + dparam2[TIMING_BNORM] ));
  273. printf("\n");
  274. fflush( stdout );
  275. free(t);
  276. free(t1);
  277. free(t2);
  278. return 0;
  279. }
  /*
   * Return 1 when the string s begins with prefix, 0 otherwise.
   * An empty prefix matches every string.
   */
  static int
  startswith(const char *s, const char *prefix) {
      const size_t prefix_len = strlen( prefix );

      return strncmp( s, prefix, prefix_len ) == 0 ? 1 : 0;
  }
  /*
   * Copy at most cap characters of the len-character field starting at src
   * into dst and NUL-terminate it. Returns the number of characters copied.
   * The caller guarantees that src holds at least len characters.
   */
  static size_t
  copy_field(char *dst, size_t cap, const char *src, size_t len) {
      if (len > cap)
          len = cap;
      memcpy( dst, src, len );
      dst[len] = '\0';
      return len;
  }

  /*
   * Parse a range specification into start, stop and step.
   *
   * Accepted forms, selected by the number of colons in range:
   *   "S"      start = S, step = max(S / 10, 1), stop = S + 10 * step
   *   "S:E"    start = S, stop = E, step = max((E - S) / 10, 1)
   *   "S:E:P"  all three given; an empty S or E field stands for 0
   *
   * Explicit values must be at least 1 (E in the three-field form only has to
   * be at least S), and S may not exceed E in the two-field form. Fields
   * before a colon are read from their first 20 characters only.
   *
   * Returns 0 and stores the three values on success; returns -1 and leaves
   * the output arguments untouched on any malformed input.
   */
  static int
  get_range(char *range, int *start_p, int *stop_p, int *step_p) {
      enum { FIELD_MAX = 20 };
      char field[FIELD_MAX + 1];
      char *first, *second, *p;
      int ncolons = 0;
      int lo = 1000, hi = 10000, inc = 1000;

      for (p = range; (p = strchr( p, ':' )) != NULL; ++p)
          ncolons++;

      switch (ncolons) {
      case 0:
          /* A single number: derive a ten-step sweep from it. */
          if (sscanf( range, "%d", &lo ) < 1 || lo < 1)
              return -1;
          inc = lo / 10;
          if (inc < 1)
              inc = 1;
          hi = lo + 10 * inc;
          break;

      case 1:
          first = strchr( range, ':' );

          /* The stop value follows the colon ... */
          if (sscanf( first + 1, "%d", &hi ) < 1 || hi < 1)
              return -1;

          /* ... and the start value precedes it. */
          copy_field( field, FIELD_MAX, range, (size_t)(first - range) );
          if (sscanf( field, "%d", &lo ) < 1 || lo > hi || lo < 1)
              return -1;

          /* Ten steps or fewer. */
          inc = (hi - lo) / 10;
          if (inc < 1)
              inc = 1;
          break;

      case 2:
          first  = strchr( range, ':' );
          second = strchr( first + 1, ':' );

          /* Start: the text before the first colon; empty means 0. */
          if (copy_field( field, FIELD_MAX, range, (size_t)(first - range) ) == 0)
              lo = 0;
          else if (sscanf( field, "%d", &lo ) < 1 || lo < 1)
              return -1;

          /* Stop: the text between the two colons; empty means 0. */
          if (copy_field( field, FIELD_MAX, first + 1, (size_t)(second - (first + 1)) ) == 0)
              hi = 0;
          else if (sscanf( field, "%d", &hi ) < 1 || hi < lo)
              return -1;

          /* Step: everything after the second colon. */
          if (sscanf( second + 1, "%d", &inc ) < 1 || inc < 1)
              return -1;
          break;

      default:
          return -1;
      }

      *start_p = lo;
      *stop_p  = hi;
      *step_p  = inc;
      return 0;
  }
  /*
   * Print the command-line usage summary to stdout.
   *
   * prog_name  name shown on the usage line (main() passes argv[0]).
   *
   * The option list mirrors what main() parses and the defaults it sets.
   * Compared with the previous text: defaults that disagreed with main() were
   * corrected (--threads, --ofmt, --atun), options that need a value now show
   * "=N", the unsupported "--nopeak" spelling is no longer advertised, two
   * typos were fixed and the options that were missing have been added.
   */
  static void
  show_help(char *prog_name) {
      static const char *const option_lines[] = {
          " --help            Show this help and exit\n",
          " --threads=C       Number of threads (default: number of online processors)\n",
          " --n_range1=R      N values of the first problem: Start:Stop:Step (default: 500:5000:500)\n",
          " --n_range2=R      N values of the second problem: Start:Stop:Step (default: 500:5000:500)\n",
          "                   Only Start is used at present\n",
          " --n_range=R       Accepted, but currently unused\n",
          " --m=N             M of the first problem (default: same as its N)\n",
          " --n_cpus1=R       CPU ids of the first scheduling context: Start:Stop:Step (default: none)\n",
          " --n_gpus1=R       GPU ids of the first scheduling context: Start:Stop:Step (default: none)\n",
          " --n_cpus2=R       CPU ids of the second scheduling context: Start:Stop:Step (default: none)\n",
          " --n_gpus2=R       GPU ids of the second scheduling context: Start:Stop:Step (default: none)\n",
          "                   Leave Start empty to begin at 0, e.g. :3:1\n",
          " --noctxs          Do not create scheduling contexts (default: contexts are created)\n",
          " --[no]check       Check result (default: nocheck)\n",
          " --[no]warmup      Perform a warmup run to pre-load libraries (default: warmup)\n",
          " --parallel=N      Use parallel tasks of size N (default: no)\n",
          " --noparallel      Do not use parallel tasks\n",
          " --niter=N         Number of iterations (default: 1)\n",
          " --nb=N            Nb size. Not used if autotuning is activated (default: 128)\n",
          " --ib=N            IB size. Not used if autotuning is activated (default: 32)\n",
          " --nrhs=N          Number of right-hand sides (default: 1)\n",
          " --[no]dyn         Activate Dynamic scheduling (default: nodyn)\n",
          " --[no]atun        Activate autotuning (default: atun)\n",
          " --ifmt=N          Input format. 0: CM, 1: CCRB, 2: CRRB, 3: RCRB, 4: RRRB, 5: RM (default: 0)\n",
          " --ofmt=N          Output format. 0: CM, 1: CCRB, 2: CRRB, 3: RCRB, 4: RRRB, 5: RM (default: 0)\n",
          " --thrdbypb=N      Number of threads per subproblem for inplace transformation (default: 1)\n",
          " --ndom=N          Value of the NDOM parameter (default: 1)\n",
          " --nocpu           Set the NO_CPU parameter (default: unset)\n",
          " --[no]profile     Profile kernels with StarPU (default: no)\n",
          " --peak            Evaluate sustained peak performance (default: no)\n",
          " --bound           Print the thGflop/s column (default: no)\n",
          " --bounddeps       Bound variant with dependencies; omits the thGflop/s column\n",
          " --bounddepsprio   Bound variant with dependencies and priorities; omits the thGflop/s column\n"
      };
      size_t k;

      printf( "Usage:\n%s [options]\n\n", prog_name );
      printf( "Options are:\n" );
      for (k = 0; k < sizeof option_lines / sizeof option_lines[0]; ++k)
          fputs( option_lines[k], stdout );
  }
  /*
   * Store the number of available processors in *thrdnbr.
   *
   * On Windows the count is read from the NUMBER_OF_PROCESSORS environment
   * variable, elsewhere from sysconf(_SC_NPROCESSORS_ONLN). If the count
   * cannot be determined (variable unset or unparsable, sysconf() failure)
   * the result is 1, so that callers always receive a usable thread count.
   *
   * The platform test accepts both the compiler-defined _WIN32/_WIN64 macros,
   * which the rest of this file uses, and the WIN32/WIN64 spellings that were
   * tested here before.
   */
  static void
  get_thread_count(int *thrdnbr) {
      int count = 0;

  #if defined( _WIN32 ) || defined( _WIN64 ) || defined WIN32 || defined WIN64
      const char *value = getenv( "NUMBER_OF_PROCESSORS" );

      /* getenv() returns NULL when the variable is not set. */
      if (value == NULL || sscanf( value, "%d", &count ) < 1)
          count = 0;
  #else
      long online = sysconf( _SC_NPROCESSORS_ONLN );

      /* sysconf() returns -1 on error. */
      if (online > 0)
          count = (int)online;
  #endif

      *thrdnbr = count > 0 ? count : 1;
  }
  /* Per-problem state. Two instances, p1 and p2, are defined at file scope;
     main() fills in their ctx and the_other_ctx members. The remaining members
     are not referenced in the code visible here (presumably used by RunTest()). */
  378. typedef struct {
  379. PLASMA_enum uplo;
  380. magma_desc_t *descA;
  381. unsigned ctx; /* scheduling context of this problem; 0 when contexts are disabled */
  382. unsigned the_other_ctx; /* scheduling context of the other problem */
  383. real_Double_t t;
  384. } params;
  385. double compute_flops(int n, int m)
  386. {
  387. double fp_per_mul, fp_per_add;
  388. if (sizeof(_TYPE) == sizeof(_PREC)) {
  389. fp_per_mul = 1;
  390. fp_per_add = 1;
  391. } else {
  392. fp_per_mul = 6;
  393. fp_per_add = 2;
  394. }
  395. double fmuls = (n * (1.0 / 6.0 * n + 0.5) * n);
  396. double fadds = (n * (1.0 / 6.0 * n ) * n);
  397. double flops = fmuls * fp_per_mul + fadds * fp_per_add;
  398. return flops;
  399. }
  /* State of the first and second problem; main() sets their scheduling
     context ids (both 0 when contexts are disabled). */
  400. params p1, p2;
  401. int
  402. main(int argc, char *argv[]) {
  403. int i;
  404. int start = 500;
  405. int stop = 5000;
  406. int step = 500;
  407. int start1 = 500;
  408. int stop1 = 5000;
  409. int step1 = 500;
  410. int start2 = 500;
  411. int stop2 = 5000;
  412. int step2 = 500;
  413. int start_cpus1 = 0, start_cpus2 = 0, start_gpus1 = 0, start_gpus2 = 0;
  414. int stop_cpus1 = -1, stop_cpus2 = -1, stop_gpus1 = -1, stop_gpus2 = -1;
  415. int step_cpus1 = 1, step_cpus2 = 1, step_gpus1 = 1, step_gpus2 = 1;
  416. int iparam[TIMING_INBPARAM];
  417. memset(iparam, 0, TIMING_INBPARAM*sizeof(int));
  418. iparam[TIMING_CHECK ] = 0;
  419. iparam[TIMING_WARMUP ] = 1;
  420. iparam[TIMING_NITER ] = 1;
  421. iparam[TIMING_N ] = 500;
  422. iparam[TIMING_N2 ] = 500;
  423. iparam[TIMING_NB ] = 128;
  424. iparam[TIMING_IB ] = 32;
  425. iparam[TIMING_NRHS ] = 1;
  426. iparam[TIMING_THRDNBR ] = 1;
  427. iparam[TIMING_NCUDAS ] = 0;
  428. iparam[TIMING_THRDNBR_SUBGRP] = 1;
  429. iparam[TIMING_SCHEDULER ] = 0;
  430. iparam[TIMING_AUTOTUNING ] = 1;
  431. iparam[TIMING_INPUTFMT ] = 0;
  432. iparam[TIMING_OUTPUTFMT ] = 0;
  433. iparam[TIMING_NDOM ] = 1;
  434. iparam[TIMING_PROFILE ] = 0;
  435. iparam[TIMING_PEAK ] = 0;
  436. iparam[TIMING_PARALLEL_TASKS] = 0;
  437. iparam[TIMING_NO_CPU ] = 0;
  438. iparam[TIMING_BOUND ] = 0;
  439. iparam[TIMING_BOUNDDEPS ] = 0;
  440. iparam[TIMING_BOUNDDEPSPRIO ] = 0;
  441. iparam[TIMING_WITH_CTXS ] = 1;
  442. get_thread_count( &(iparam[TIMING_THRDNBR]) );
  443. for (i = 1; i < argc && argv[i]; ++i) {
  444. if (startswith( argv[i], "--help" )) {
  445. show_help( argv[0] );
  446. return EXIT_SUCCESS;
  447. } else if (startswith( argv[i], "--n_cpus1=" )) {
  448. get_range( strchr( argv[i], '=' ) + 1, &start_cpus1, &stop_cpus1, &step_cpus1 );
  449. } else if (startswith( argv[i], "--n_cpus2=" )) {
  450. get_range( strchr( argv[i], '=' ) + 1, &start_cpus2, &stop_cpus2, &step_cpus2 );
  451. } else if (startswith( argv[i], "--n_gpus1=" )) {
  452. get_range( strchr( argv[i], '=' ) + 1, &start_gpus1, &stop_gpus1, &step_gpus1 );
  453. } else if (startswith( argv[i], "--n_gpus2=" )) {
  454. get_range( strchr( argv[i], '=' ) + 1, &start_gpus2, &stop_gpus2, &step_gpus2 );
  455. } else if (startswith( argv[i], "--n_range=" )) {
  456. get_range( strchr( argv[i], '=' ) + 1, &start, &stop, &step );
  457. } else if (startswith( argv[i], "--n_range1=" )) {
  458. get_range( strchr( argv[i], '=' ) + 1, &start1, &stop1, &step1 );
  459. } else if (startswith( argv[i], "--n_range2=" )) {
  460. get_range( strchr( argv[i], '=' ) + 1, &start2, &stop2, &step2 );
  461. } else if (startswith( argv[i], "--threads=" )) {
  462. sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_THRDNBR]) );
  463. /* } else if (startswith( argv[i], "--gnuplot-png" )) { */
  464. /* gnuplot = 2; */
  465. /* } else if (startswith( argv[i], "--gnuplot" )) { */
  466. /* gnuplot = 1; */
  467. } else if (startswith( argv[i], "--noctxs" )) {
  468. iparam[TIMING_WITH_CTXS] = 0;
  469. } else if (startswith( argv[i], "--check" )) {
  470. iparam[TIMING_CHECK] = 1;
  471. } else if (startswith( argv[i], "--nocheck" )) {
  472. iparam[TIMING_CHECK] = 0;
  473. } else if (startswith( argv[i], "--warmup" )) {
  474. iparam[TIMING_WARMUP] = 1;
  475. } else if (startswith( argv[i], "--nowarmup" )) {
  476. iparam[TIMING_WARMUP] = 0;
  477. } else if (startswith( argv[i], "--dyn" )) {
  478. iparam[TIMING_SCHEDULER] = 1;
  479. } else if (startswith( argv[i], "--nodyn" )) {
  480. iparam[TIMING_SCHEDULER] = 0;
  481. } else if (startswith( argv[i], "--atun" )) {
  482. iparam[TIMING_AUTOTUNING] = 1;
  483. } else if (startswith( argv[i], "--noatun" )) {
  484. iparam[TIMING_AUTOTUNING] = 0;
  485. } else if (startswith( argv[i], "--profile" )) {
  486. iparam[TIMING_PROFILE] = 1;
  487. } else if (startswith( argv[i], "--peak" )) {
  488. iparam[TIMING_PEAK] = 1;
  489. } else if (startswith( argv[i], "--noprofile" )) {
  490. iparam[TIMING_PROFILE] = 0;
  491. } else if (startswith( argv[i], "--parallel=" )) {
  492. sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_PARALLEL_TASKS]) );
  493. } else if (startswith( argv[i], "--noparallel" )) {
  494. iparam[TIMING_PARALLEL_TASKS] = 0;
  495. } else if (startswith( argv[i], "--nocpu" )) {
  496. iparam[TIMING_NO_CPU] = 1;
  497. } else if (startswith( argv[i], "--nb=" )) {
  498. sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_NB]) );
  499. } else if (startswith( argv[i], "--m=" )) {
  500. sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_M]) );
  501. } else if (startswith( argv[i], "--ib=" )) {
  502. sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_IB]) );
  503. } else if (startswith( argv[i], "--nrhs=" )) {
  504. sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_NRHS]) );
  505. } else if (startswith( argv[i], "--ifmt=" )) {
  506. sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_INPUTFMT]) );
  507. } else if (startswith( argv[i], "--ofmt=" )) {
  508. sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_OUTPUTFMT]) );
  509. } else if (startswith( argv[i], "--thrdbypb=" )) {
  510. sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_THRDNBR_SUBGRP]) );
  511. } else if (startswith( argv[i], "--niter=" )) {
  512. sscanf( strchr( argv[i], '=' ) + 1, "%d", &iparam[TIMING_NITER] );
  513. } else if (startswith( argv[i], "--ndom=" )) {
  514. sscanf( strchr( argv[i], '=' ) + 1, "%d", &iparam[TIMING_NDOM] );
  515. } else if (startswith( argv[i], "--bounddepsprio" )) {
  516. iparam[TIMING_BOUND] = 1;
  517. iparam[TIMING_BOUNDDEPS] = 1;
  518. iparam[TIMING_BOUNDDEPSPRIO] = 1;
  519. } else if (startswith( argv[i], "--bounddeps" )) {
  520. iparam[TIMING_BOUND] = 1;
  521. iparam[TIMING_BOUNDDEPS] = 1;
  522. } else if (startswith( argv[i], "--bound" )) {
  523. iparam[TIMING_BOUND] = 1;
  524. } else {
  525. fprintf( stderr, "Unknown option: %s\n", argv[i] );
  526. }
  527. }
  528. if (step < 1) step = 1;
  529. if (step1 < 1) step1 = 1;
  530. if (step2 < 1) step2 = 1;
  531. /* TODO : correct into plasma */
  532. if ( iparam[TIMING_IB] > iparam[TIMING_NB] )
  533. iparam[TIMING_IB] = iparam[TIMING_NB];
  534. /* TODO */
  535. if (iparam[TIMING_PARALLEL_TASKS]) {
  536. MAGMA_InitPar(iparam[TIMING_THRDNBR]/iparam[TIMING_PARALLEL_TASKS],
  537. iparam[TIMING_NCUDAS],
  538. iparam[TIMING_PARALLEL_TASKS]);
  539. }
  540. else {
  541. MAGMA_Init( iparam[TIMING_THRDNBR],
  542. iparam[TIMING_NCUDAS]);
  543. }
  544. MAGMA_Disable(MAGMA_AUTOTUNING);
  545. MAGMA_Set(MAGMA_TILE_SIZE, iparam[TIMING_NB] );
  546. MAGMA_Set(MAGMA_INNER_BLOCK_SIZE, iparam[TIMING_IB] );
  547. if(iparam[TIMING_WITH_CTXS])
  548. {
  549. int nprocs1 = (stop_cpus1 - start_cpus1 + 1)/step_cpus1 + (stop_gpus1 - start_gpus1 + 1)/step_gpus1;
  550. int nprocs2 = (stop_cpus2 - start_cpus2 + 1)/step_cpus2 + (stop_gpus2 - start_gpus2 + 1)/step_gpus2;
  551. int procs1[nprocs1];
  552. int procs2[nprocs2];
  553. int i, j = 0;
  554. printf("%d: ", nprocs1);
  555. for (i = start_gpus1; i <= stop_gpus1; i += step_gpus1)
  556. {
  557. printf("%d ", i);
  558. procs1[j++] = i;
  559. }
  560. for (i = start_cpus1; i <= stop_cpus1; i += step_cpus1)
  561. {
  562. printf("%d ", i);
  563. procs1[j++] = i;
  564. }
  565. printf("\n");
  566. printf("%d: ", nprocs2);
  567. j = 0;
  568. for (i = start_gpus2; i <= stop_gpus2; i += step_gpus2)
  569. {
  570. printf("%d ", i);
  571. procs2[j++] = i;
  572. }
  573. for (i = start_cpus2; i <= stop_cpus2; i += step_cpus2)
  574. {
  575. printf("%d ", i);
  576. procs2[j++] = i;
  577. }
  578. printf("\n");
  579. struct starpu_sched_ctx_hypervisor_criteria *criteria = sched_ctx_hypervisor_init(SIMPLE_POLICY);
  580. p1.ctx = starpu_create_sched_ctx_with_criteria("heft", procs1, nprocs1, "sched_ctx1", &criteria);
  581. p2.ctx = starpu_create_sched_ctx_with_criteria("heft", procs2, nprocs2, "sched_ctx2", &criteria);
  582. /* p1.ctx = starpu_create_sched_ctx("heft", procs1, nprocs1, "sched_ctx1"); */
  583. /* p2.ctx = starpu_create_sched_ctx("heft", procs2, nprocs2, "sched_ctx2"); */
  584. double flops1 = compute_flops(start1, start1);
  585. double flops2 = compute_flops(start2, start2);
  586. printf("flops1 = %lf flops2 = %lf\n", flops1, flops2);
  587. sched_ctx_hypervisor_handle_ctx(p1.ctx, compute_flops(start1, start1));
  588. sched_ctx_hypervisor_handle_ctx(p2.ctx, compute_flops(start2, start2));
  589. p1.the_other_ctx = p2.ctx;
  590. p2.the_other_ctx = p1.ctx;
  591. int procs[12];
  592. for(i = 0; i < 12; i++)
  593. procs[i] = i;
  594. int gpus[3];
  595. for(i = 0; i < 3; i++)
  596. gpus[i] = i;
  597. sched_ctx_hypervisor_ioctl(p1.ctx,
  598. HYPERVISOR_GRANULARITY, 2,
  599. HYPERVISOR_MIN_TASKS, 10,
  600. HYPERVISOR_MIN_WORKERS, 3,
  601. HYPERVISOR_MAX_WORKERS, 12,
  602. HYPERVISOR_FIXED_WORKERS, gpus, 3,
  603. // HYPERVISOR_MAX_IDLE, procs, 12, 40000.0,
  604. // HYPERVISOR_MAX_IDLE, gpus, 3, 10000.0,
  605. NULL);
  606. sched_ctx_hypervisor_ioctl(p2.ctx,
  607. HYPERVISOR_GRANULARITY, 2,
  608. HYPERVISOR_MIN_TASKS, 10,
  609. HYPERVISOR_MIN_WORKERS, 0,
  610. HYPERVISOR_MAX_WORKERS, 12,
  611. HYPERVISOR_FIXED_WORKERS, gpus, 3,
  612. // HYPERVISOR_MAX_IDLE, procs, 12, 40000.0,
  613. // HYPERVISOR_MAX_IDLE, gpus, 3, 10000.0,
  614. NULL);
  615. }
  616. else
  617. {
  618. p1.ctx = 0;
  619. p2.ctx = 0;
  620. }
  621. Test( -1, iparam ); /* print header */
  622. iparam[TIMING_N] = start1;
  623. iparam[TIMING_N2] = start2;
  624. if ( iparam[TIMING_M] == 0 )
  625. iparam[TIMING_M] = iparam[TIMING_N];
  626. if ( iparam[TIMING_M2] == 0 )
  627. iparam[TIMING_M2] = iparam[TIMING_N2];
  628. Test( start1, iparam );
  629. MAGMA_Finalize();
  630. if(iparam[TIMING_WITH_CTXS])
  631. sched_ctx_hypervisor_shutdown();
  632. /* if (gnuplot) { */
  633. /* printf( "%s\n%s\n", */
  634. /* "e", */
  635. /* gnuplot > 1 ? "" : "pause 10" ); */
  636. /* } */
  637. return EXIT_SUCCESS;
  638. }