energy_efficiency.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2016-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. * Copyright (C) 2016 Bérangère Subervie
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <stdbool.h>
  18. #include <starpu.h>
  19. #include <limits.h>
  20. #include "../helper.h"
  21. /*
  22. * This tries to run kernels with different efficiency depending on the core
  23. * frequency.
  24. *
  25. * This is based on the Cholesky factorization, which is made to exhibit three
  26. * caricatural cases as follows:
  27. *
  28. * - gemm: always get faster with higher frequency
  29. * - trsm: gets faster with higher frequency, but efficiency gets lower and
  30. * lower
  31. * - potrf: reaches a maximum performance, after which there is no point in
  32. * running it at higher frequency.
  33. *
  34. * We here assume that the power use is the same for the different kernels
  35. * (which wouldn't be true for real kernels, measurements would be needed, to
  36. * feed the performance models).
  37. */
  38. /* These are the different frequency and power parameters, as measured and
  39. * provided to this program */
  40. static float freq_min, freq_fast;
  41. static float power_min, power_fast;
  42. /*
  43. * This returns the dynamic power used by a CPU core in W at a given frequency
  44. * in MHz
  45. * This assumes C.V^2.F with V being proportional to F, thus C.F^3
  46. freq_min = 1200
  47. freq_fast = 3500
  48. power_min = 2
  49. power_fast = 8.2
  50. freq_min3 = freq_min * freq_min * freq_min
  51. freq_fast3 = freq_fast * freq_fast * freq_fast
  52. alpha = (power_fast - power_min) / (freq_fast3 - freq_min3)
  53. power(frequency) = power_min + alpha * (frequency*frequency*frequency - freq_min3)
  54. plot [frequency=freq_min:freq_fast] power(frequency) lw 2
  55. */
  56. static float power(float frequency)
  57. {
  58. double freq_min3 = freq_min * freq_min * freq_min;
  59. double freq_fast3 = freq_fast * freq_fast * freq_fast;
  60. double alpha = (power_fast - power_min) / (freq_fast3 - freq_min3);
  61. return power_min + alpha * ( frequency*frequency*frequency - freq_min3);
  62. }
  63. /*
  64. * This returns the frequency of the given worker and implementation in MHz.
  65. * This is where we can tune either a given number of cores at a low frequency,
  66. * or which implementation uses which frequency. */
  67. /* These are the chosen parameters: how many cores get slowed down, at which
  68. * frequency */
  69. static int ncpu_slow = -1;
  70. static float freq_slow;
  71. static float frequency(int worker, unsigned i)
  72. {
  73. if (ncpu_slow == -1)
  74. {
  75. /* Version that allows the runtime to switch speed between
  76. * tasks, by exposing two implementations with different time
  77. * and energy */
  78. if (i == 0)
  79. /* Slow implementation */
  80. return freq_slow;
  81. else
  82. /* Fast implementation */
  83. return freq_fast;
  84. }
  85. else
  86. {
  87. /* Version that assumes that ncpu_slow workers are running at
  88. * slow speed */
  89. if (worker < ncpu_slow)
  90. return freq_slow;
  91. else
  92. return freq_fast;
  93. }
  94. }
  95. /* This is from magma
  96. -- Innovative Computing Laboratory
  97. -- Electrical Engineering and Computer Science Department
  98. -- University of Tennessee
  99. -- (C) Copyright 2009
  100. Redistribution and use in source and binary forms, with or without
  101. modification, are permitted provided that the following conditions
  102. are met:
  103. * Redistributions of source code must retain the above copyright
  104. notice, this list of conditions and the following disclaimer.
  105. * Redistributions in binary form must reproduce the above copyright
  106. notice, this list of conditions and the following disclaimer in the
  107. documentation and/or other materials provided with the distribution.
  108. * Neither the name of the University of Tennessee, Knoxville nor the
  109. names of its contributors may be used to endorse or promote products
  110. derived from this software without specific prior written permission.
  111. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  112. ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  113. LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  114. A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  115. HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  116. SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  117. LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  118. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  119. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  120. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  121. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  122. */
  123. #define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
  124. #define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) ) * (double)(__n) - (1. / 6.)))
  125. #define FLOPS_SPOTRF(__n) ( FMULS_POTRF((__n)) + FADDS_POTRF((__n)) )
  126. #define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
  127. #define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
  128. #define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
  129. #define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
  130. #define FMULS_TRSM FMULS_TRMM
  131. #define FADDS_TRSM FMULS_TRMM
  132. #define FLOPS_STRSM(__m, __n) ( FMULS_TRSM((__m), (__n)) + FADDS_TRSM((__m), (__n)) )
  133. #define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
  134. #define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
  135. #define FLOPS_SGEMM(__m, __n, __k) ( FMULS_GEMM((__m), (__n), (__k)) + FADDS_GEMM((__m), (__n), (__k)) )
  136. /* Tags for spotting tasks in the trace */
  137. #define TAG11(k) ((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
  138. #define TAG21(k,j) ((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32) \
  139. | (unsigned long long)(j))))
  140. #define TAG22(k,i,j) ((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) \
  141. | ((unsigned long long)(i)<<16) \
  142. | (unsigned long long)(j))))
  143. /* Arbitrary tile size */
  144. #define TILE_SIZE 512
  145. /*
  146. * Kernel time performance models, would normally be provided by measurements
  147. */
  148. /* We assume that GEMM scales perfectly with frequency */
  149. #define GEMM_GFLOPS 50. /* At full speed */
  150. #define GEMM_FLOPS(N) FLOPS_SGEMM(N, N, N)
  151. #define GEMM_TIME(N) (GEMM_FLOPS(TILE_SIZE) / (GEMM_GFLOPS * 1000000000.))
  152. static double _gemm_time(float frequency)
  153. {
  154. double ret = GEMM_TIME(N);
  155. /* Fix according to real frequency, linear */
  156. ret = GEMM_TIME(N) / (frequency / freq_fast);
  157. return ret * 1000000.;
  158. }
  159. static double gemm_time(struct starpu_task *t, unsigned workerid, unsigned i)
  160. {
  161. (void)t;
  162. return _gemm_time(frequency(workerid, i));
  163. }
  164. /* We assume that TRSM decays a bit with frequency */
  165. #define TRSM_DECAY 0.5
  166. #define TRSM_FLOPS(N) FLOPS_STRSM(N, N)
  167. static double _trsm_time(float frequency)
  168. {
  169. double ret = GEMM_TIME(N)*0.7; /* as typically observed */
  170. /* Fix according to real frequency, root */
  171. ret = ret / (pow(frequency - freq_min/2, TRSM_DECAY) / pow(freq_fast - freq_min/2, TRSM_DECAY));
  172. return ret * 1000000.;
  173. }
  174. static double trsm_time(struct starpu_task *t, unsigned workerid, unsigned i)
  175. {
  176. (void)t;
  177. return _trsm_time(frequency(workerid, i));
  178. }
  179. /* We assume that POTRF decays strongly with frequency */
  180. #define POTRF_DECAY 0.5
  181. #define POTRF_FLOPS(N) FLOPS_SPOTRF(N)
  182. static double _potrf_time(float frequency)
  183. {
  184. double ret = GEMM_TIME(N)*1.2; /* as typically observed */
  185. /* Fix according to real frequency, asymptote */
  186. ret = ret / (1. - POTRF_DECAY * ((freq_min/(frequency-freq_min/2)) - (freq_min/(freq_fast-freq_min/2))));
  187. return ret * 1000000.;
  188. }
  189. static double potrf_time(struct starpu_task *t, unsigned workerid, unsigned i)
  190. {
  191. (void)t;
  192. return _potrf_time(frequency(workerid, i));
  193. }
  194. /* stub for kernel, shouldn't be getting called in simgrid mode */
  195. void dummy_func(void *descr[], void *_args)
  196. {
  197. (void)descr; (void)_args;
  198. fprintf(stderr, "?? shouldn't be called\n");
  199. }
  200. /* Define the codelets */
  201. #define CODELET(kernel, nb, ...) \
  202. static double kernel##_energy(struct starpu_task *t, unsigned workerid, unsigned i) \
  203. { \
  204. double time = kernel##_time(t, workerid, i); \
  205. return power(frequency(workerid, i)) * time / 1000000.; \
  206. } \
  207. \
  208. static struct starpu_perfmodel kernel##_perf_model = \
  209. { \
  210. .symbol = #kernel, \
  211. .type = STARPU_PER_WORKER, \
  212. .worker_cost_function = kernel##_time, \
  213. }; \
  214. \
  215. static struct starpu_perfmodel kernel##_energy_model = \
  216. { \
  217. .symbol = #kernel "_energy", \
  218. .type = STARPU_PER_WORKER, \
  219. .worker_cost_function = kernel##_energy, \
  220. }; \
  221. \
  222. static struct starpu_codelet kernel##_cl = \
  223. { \
  224. .cpu_funcs = { dummy_func }, \
  225. .nbuffers = nb, \
  226. .modes = {__VA_ARGS__}, \
  227. .model = &kernel##_perf_model, \
  228. .energy_model = &kernel##_energy_model, \
  229. };
  230. CODELET(potrf, 1, STARPU_RW)
  231. CODELET(trsm, 2, STARPU_R, STARPU_RW)
  232. CODELET(gemm, 3, STARPU_R, STARPU_R, STARPU_RW)
  233. int main(int argc, char *argv[])
  234. {
  235. /* Initialize environment variables */
  236. if (!getenv("STARPU_IDLE_POWER"))
  237. setenv("STARPU_IDLE_POWER", "30", 1);
  238. const char *hostname = getenv("STARPU_HOSTNAME");
  239. if (!hostname || strcmp(hostname, "sirocco"))
  240. {
  241. printf("Warning: This is expected to be run with export STARPU_HOSTNAME=sirocco\n");
  242. }
  243. freq_min = starpu_get_env_number_default("STARPU_FREQ_MIN", 1200);
  244. freq_slow = starpu_get_env_number_default("STARPU_FREQ_SLOW", 1200);
  245. freq_fast = starpu_get_env_number_default("STARPU_FREQ_FAST", 3500);
  246. power_min = starpu_get_env_float_default("STARPU_POWER_MIN", 2);
  247. power_fast = starpu_get_env_float_default("STARPU_POWER_FAST", 8.2);
  248. /* Number of slow CPU cores */
  249. ncpu_slow = starpu_get_env_number_default("STARPU_NCPU_SLOW", -1);
  250. if (ncpu_slow == -1)
  251. {
  252. /* Enable second implementation. */
  253. potrf_cl.cpu_funcs[1] = dummy_func;
  254. trsm_cl.cpu_funcs[1] = dummy_func;
  255. gemm_cl.cpu_funcs[1] = dummy_func;
  256. }
  257. /* Initialize StarPU */
  258. struct starpu_conf conf;
  259. starpu_conf_init(&conf);
  260. starpu_conf_noworker(&conf);
  261. conf.ncpus = -1;
  262. if (!getenv("STARPU_SCHED"))
  263. conf.sched_policy_name = "dmdas";
  264. int ret = starpu_initialize(&conf, &argc, &argv);
  265. if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
  266. STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
  267. unsigned N, k, m, n, iter, NITER;
  268. if (argc < 2)
  269. #ifdef STARPU_QUICK_CHECK
  270. N = 10;
  271. #else
  272. N = 40;
  273. #endif
  274. else
  275. N = atoi(argv[1]);
  276. if (argc < 3)
  277. #ifdef STARPU_QUICK_CHECK
  278. NITER = 3;
  279. #else
  280. NITER = 10;
  281. #endif
  282. else
  283. NITER = atoi(argv[2]);
  284. if (N == 0)
  285. {
  286. starpu_shutdown();
  287. return 0;
  288. }
  289. /* Give parameter summary to user */
  290. printf("freqs (MHz):\n");
  291. printf("%f %f %f\n", freq_min, freq_slow, freq_fast);
  292. printf("\n");
  293. printf("per-core power (W):\n");
  294. printf("%f %f\n", power_min, power_fast);
  295. printf("%f %f %f\n", power(freq_min), power(freq_slow), power(freq_fast));
  296. printf("\n");
  297. printf("kernel perfs in GFlops (min, slow, fast):\n");
  298. printf("gemm:\t%f %f %f\n",
  299. GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_min) / 1000,
  300. GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_slow) / 1000,
  301. GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_fast) / 1000);
  302. printf("trsm:\t%f %f %f\n",
  303. TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_min) / 1000,
  304. TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_slow) / 1000,
  305. TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_fast) / 1000);
  306. printf("potrf:\t%f %f %f\n",
  307. POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_min) / 1000,
  308. POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_slow) / 1000,
  309. POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_fast) / 1000);
  310. printf("\n");
  311. printf("kernel efficiency in GFlops/W (min, slow, fast):\n");
  312. printf("gemm:\t%f %f %f\n",
  313. GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_min) / 1000 / power(freq_min),
  314. GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_slow) / 1000 / power(freq_slow),
  315. GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_fast) / 1000 / power(freq_fast));
  316. printf("trsm:\t%f %f %f\n",
  317. TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_min) / 1000 / power(freq_min),
  318. TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_slow) / 1000 / power(freq_slow),
  319. TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_fast) / 1000 / power(freq_fast));
  320. printf("potrf:\t%f %f %f\n",
  321. POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_min) / 1000 / power(freq_min),
  322. POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_slow) / 1000 / power(freq_slow),
  323. POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_fast) / 1000 / power(freq_fast));
  324. printf("\n");
  325. /* Now compute */
  326. starpu_data_handle_t A[N][N];
  327. for (m = 0; m < N; m++)
  328. for (n = 0; n < N; n++)
  329. starpu_void_data_register(&A[m][n]);
  330. unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
  331. double timing_sum = 0.;
  332. double energy_sum = 0.;
  333. double timing_sum2 = 0.;
  334. double energy_sum2 = 0.;
  335. for (iter = 0; iter < NITER; iter++)
  336. {
  337. double start = starpu_timing_now();
  338. double start_energy = starpu_energy_used();
  339. for (k = 0; k < N; k++)
  340. {
  341. starpu_iteration_push(k);
  342. ret = starpu_task_insert(&potrf_cl,
  343. STARPU_PRIORITY, unbound_prio ? (int)(2*N - 2*k) : STARPU_MAX_PRIO,
  344. STARPU_RW, A[k][k],
  345. STARPU_FLOPS, (double) FLOPS_SPOTRF(TILE_SIZE),
  346. STARPU_TAG_ONLY, TAG11(k),
  347. 0);
  348. if (ret == -ENODEV) return 77;
  349. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
  350. for (m = k+1; m<N; m++)
  351. {
  352. ret = starpu_task_insert(&trsm_cl,
  353. STARPU_PRIORITY, unbound_prio ? (int)(2*N - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  354. STARPU_R, A[k][k],
  355. STARPU_RW, A[m][k],
  356. STARPU_FLOPS, (double) FLOPS_STRSM(TILE_SIZE, TILE_SIZE),
  357. STARPU_TAG_ONLY, TAG21(m,k),
  358. 0);
  359. if (ret == -ENODEV) return 77;
  360. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
  361. }
  362. for (m = k+1; m<N; m++)
  363. {
  364. for (n = k+1; n<N; n++)
  365. {
  366. if (n <= m)
  367. {
  368. ret = starpu_task_insert(&gemm_cl,
  369. STARPU_PRIORITY, unbound_prio ? (int)(2*N - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  370. STARPU_R, A[m][k],
  371. STARPU_R, A[n][k],
  372. gemm_cl.modes[2], A[m][n],
  373. STARPU_FLOPS, (double) FLOPS_SGEMM(TILE_SIZE, TILE_SIZE, TILE_SIZE),
  374. STARPU_TAG_ONLY, TAG22(k,m,n),
  375. 0);
  376. if (ret == -ENODEV) return 77;
  377. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
  378. }
  379. }
  380. }
  381. starpu_iteration_pop();
  382. }
  383. starpu_task_wait_for_all();
  384. double end = starpu_timing_now();
  385. double end_energy = starpu_energy_used();
  386. double timing = end - start;
  387. double energy = end_energy - start_energy;
  388. timing_sum += timing;
  389. timing_sum2 += timing*timing;
  390. energy_sum += energy;
  391. energy_sum2 += energy*energy;
  392. }
  393. /* Make stats and print */
  394. double timing_avg = timing_sum / NITER;
  395. double timing_dev = sqrt((fabs(timing_sum2 - (timing_sum*timing_sum)/NITER))/NITER);
  396. double energy_avg = energy_sum / NITER;
  397. double energy_dev = sqrt((fabs(energy_sum2 - (energy_sum*energy_sum)/NITER))/NITER);
  398. double flop = FLOPS_SPOTRF(TILE_SIZE * N);
  399. unsigned toprint_slow;
  400. if (ncpu_slow >= 0)
  401. toprint_slow = ncpu_slow;
  402. else
  403. toprint_slow = freq_slow;
  404. printf("# size\t%s\tms +-\tGFlop/s +-\ten. (J) +-\tGF/W\n",
  405. ncpu_slow >= 0 ? "nslow" : "fslow");
  406. printf("%u\t%u\t%.0f %.1f\t%.1f %.1f\t%.1f %.1f\t%.2f\n",
  407. TILE_SIZE * N,
  408. toprint_slow,
  409. timing_avg/1000,
  410. timing_dev/1000,
  411. (flop/timing_avg/1000.0f),
  412. (flop/(timing_avg*timing_avg)/1000.f)*timing_dev,
  413. energy_avg, energy_dev,
  414. flop/1000000000./energy_avg);
  415. for (m = 0; m < N; m++)
  416. for (n = 0; n < N; n++)
  417. starpu_data_unregister(A[m][n]);
  418. out:
  419. starpu_shutdown();
  420. return 0;
  421. }