perfmodel_bus.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623
  1. /*
  2. * StarPU
  3. * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include <unistd.h>
  17. #include <sys/time.h>
  18. #include <stdlib.h>
  19. #include <starpu.h>
  20. #include <common/config.h>
  21. #include <core/workers.h>
  22. #include <core/perfmodel/perfmodel.h>
  23. #define SIZE (32*1024*1024*sizeof(char))
  24. #define NITER 128
  25. #define MAXCPUS 32
  26. struct cudadev_timing {
  27. int cpu_id;
  28. double timing_htod;
  29. double timing_dtoh;
  30. };
  31. static double bandwith_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
  32. static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{ -1.0}};
  33. static unsigned was_benchmarked = 0;
  34. static int ncuda = 0;
  35. static int affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
  36. /* Benchmarking the performance of the bus */
  37. #ifdef USE_CUDA
  38. static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
  39. static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
  40. static struct cudadev_timing cudadev_timing_per_cpu[STARPU_MAXNODES][MAXCPUS];
  41. static void measure_bandwith_between_host_and_dev_on_cpu(int dev, int cpu)
  42. {
  43. struct machine_config_s *config = _starpu_get_machine_config();
  44. _starpu_bind_thread_on_cpu(config, cpu);
  45. /* Initiliaze CUDA context on the device */
  46. cudaSetDevice(dev);
  47. /* hack to avoid third party libs to rebind threads */
  48. _starpu_bind_thread_on_cpu(config, cpu);
  49. /* hack to force the initialization */
  50. cudaFree(0);
  51. /* hack to avoid third party libs to rebind threads */
  52. _starpu_bind_thread_on_cpu(config, cpu);
  53. /* Allocate a buffer on the device */
  54. unsigned char *d_buffer;
  55. cudaMalloc((void **)&d_buffer, SIZE);
  56. assert(d_buffer);
  57. /* hack to avoid third party libs to rebind threads */
  58. _starpu_bind_thread_on_cpu(config, cpu);
  59. /* Allocate a buffer on the host */
  60. unsigned char *h_buffer;
  61. cudaHostAlloc((void **)&h_buffer, SIZE, 0);
  62. assert(h_buffer);
  63. /* hack to avoid third party libs to rebind threads */
  64. _starpu_bind_thread_on_cpu(config, cpu);
  65. /* Fill them */
  66. memset(h_buffer, 0, SIZE);
  67. cudaMemset(d_buffer, 0, SIZE);
  68. /* hack to avoid third party libs to rebind threads */
  69. _starpu_bind_thread_on_cpu(config, cpu);
  70. unsigned iter;
  71. double timing;
  72. struct timeval start;
  73. struct timeval end;
  74. cudadev_timing_per_cpu[dev+1][cpu].cpu_id = cpu;
  75. /* Measure upload bandwith */
  76. gettimeofday(&start, NULL);
  77. for (iter = 0; iter < NITER; iter++)
  78. {
  79. cudaMemcpy(d_buffer, h_buffer, SIZE, cudaMemcpyHostToDevice);
  80. cudaThreadSynchronize();
  81. }
  82. gettimeofday(&end, NULL);
  83. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  84. cudadev_timing_per_cpu[dev+1][cpu].timing_htod = timing/NITER;
  85. /* Measure download bandwith */
  86. gettimeofday(&start, NULL);
  87. for (iter = 0; iter < NITER; iter++)
  88. {
  89. cudaMemcpy(h_buffer, d_buffer, SIZE, cudaMemcpyDeviceToHost);
  90. cudaThreadSynchronize();
  91. }
  92. gettimeofday(&end, NULL);
  93. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  94. cudadev_timing_per_cpu[dev+1][cpu].timing_dtoh = timing/NITER;
  95. /* Free buffers */
  96. cudaFreeHost(h_buffer);
  97. cudaFree(d_buffer);
  98. cudaThreadExit();
  99. }
  100. /* NB: we want to sort the bandwith by DECREASING order */
  101. int compar_cudadev_timing(const void *left_cudadev_timing, const void *right_cudadev_timing)
  102. {
  103. const struct cudadev_timing *left = left_cudadev_timing;
  104. const struct cudadev_timing *right = right_cudadev_timing;
  105. double left_dtoh = left->timing_dtoh;
  106. double left_htod = left->timing_htod;
  107. double right_dtoh = right->timing_dtoh;
  108. double right_htod = right->timing_htod;
  109. double bandwith_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
  110. double bandwith_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
  111. /* it's for a decreasing sorting */
  112. return (bandwith_sum2_left < bandwith_sum2_right);
  113. }
  114. static void measure_bandwith_between_host_and_dev(int dev, unsigned ncores)
  115. {
  116. unsigned core;
  117. for (core = 0; core < ncores; core++)
  118. {
  119. measure_bandwith_between_host_and_dev_on_cpu(dev, core);
  120. }
  121. /* sort the results */
  122. qsort(cudadev_timing_per_cpu[dev+1], ncores,
  123. sizeof(struct cudadev_timing),
  124. compar_cudadev_timing);
  125. #ifdef VERBOSE
  126. for (core = 0; core < ncores; core++)
  127. {
  128. unsigned current_core = cudadev_timing_per_cpu[dev+1][core].cpu_id;
  129. double bandwith_dtoh = cudadev_timing_per_cpu[dev+1][core].timing_dtoh;
  130. double bandwith_htod = cudadev_timing_per_cpu[dev+1][core].timing_htod;
  131. double bandwith_sum2 = bandwith_dtoh*bandwith_dtoh + bandwith_htod*bandwith_htod;
  132. fprintf(stderr, "BANDWITH GPU %d CPU %d - htod %lf - dtoh %lf - %lf\n", dev, current_core, bandwith_htod, bandwith_dtoh, sqrt(bandwith_sum2));
  133. }
  134. unsigned best_core = cudadev_timing_per_cpu[dev+1][0].cpu_id;
  135. fprintf(stderr, "BANDWITH GPU %d BEST CPU %d\n", dev, best_core);
  136. #endif
  137. /* The results are sorted in a decreasing order, so that the best
  138. * measurement is currently the first entry. */
  139. cudadev_timing_dtoh[dev+1] = cudadev_timing_per_cpu[dev+1][0].timing_dtoh;
  140. cudadev_timing_htod[dev+1] = cudadev_timing_per_cpu[dev+1][0].timing_htod;
  141. }
  142. #endif
  143. static void benchmark_all_cuda_devices(void)
  144. {
  145. int ret;
  146. #ifdef VERBOSE
  147. fprintf(stderr, "Benchmarking the speed of the bus\n");
  148. #endif
  149. /* Save the current cpu binding */
  150. cpu_set_t former_process_affinity;
  151. ret = sched_getaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
  152. if (ret)
  153. {
  154. perror("sched_getaffinity");
  155. STARPU_ABORT();
  156. }
  157. #ifdef USE_CUDA
  158. struct machine_config_s *config = _starpu_get_machine_config();
  159. unsigned ncores = _starpu_topology_get_nhwcore(config);
  160. cudaGetDeviceCount(&ncuda);
  161. int i;
  162. for (i = 0; i < ncuda; i++)
  163. {
  164. /* measure bandwith between Host and Device i */
  165. measure_bandwith_between_host_and_dev(i, ncores);
  166. }
  167. #endif
  168. was_benchmarked = 1;
  169. /* Restore the former affinity */
  170. ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
  171. if (ret)
  172. {
  173. perror("sched_setaffinity");
  174. STARPU_ABORT();
  175. }
  176. #ifdef VERBOSE
  177. fprintf(stderr, "Benchmarking the speed of the bus is done.\n");
  178. #endif
  179. }
  180. static void get_bus_path(const char *type, char *path, size_t maxlen)
  181. {
  182. _starpu_get_perf_model_dir_bus(path, maxlen);
  183. strncat(path, type, maxlen);
  184. char hostname[32];
  185. gethostname(hostname, 32);
  186. strncat(path, ".", maxlen);
  187. strncat(path, hostname, maxlen);
  188. }
  189. /*
  190. * Affinity
  191. */
  192. static void get_affinity_path(char *path, size_t maxlen)
  193. {
  194. get_bus_path("affinity", path, maxlen);
  195. }
  196. static void load_bus_affinity_file_content(void)
  197. {
  198. FILE *f;
  199. char path[256];
  200. get_affinity_path(path, 256);
  201. f = fopen(path, "r");
  202. STARPU_ASSERT(f);
  203. #ifdef USE_CUDA
  204. struct machine_config_s *config = _starpu_get_machine_config();
  205. unsigned ncores = _starpu_topology_get_nhwcore(config);
  206. cudaGetDeviceCount(&ncuda);
  207. int gpu;
  208. for (gpu = 0; gpu < ncuda; gpu++)
  209. {
  210. int ret;
  211. int dummy;
  212. ret = fscanf(f, "%d\t", &dummy);
  213. STARPU_ASSERT(ret == 1);
  214. STARPU_ASSERT(dummy == gpu);
  215. unsigned core;
  216. for (core = 0; core < ncores; core++)
  217. {
  218. ret = fscanf(f, "%d\t", &affinity_matrix[gpu][core]);
  219. STARPU_ASSERT(ret == 1);
  220. }
  221. ret = fscanf(f, "\n");
  222. STARPU_ASSERT(ret == 0);
  223. }
  224. #endif
  225. fclose(f);
  226. }
  227. static void write_bus_affinity_file_content(void)
  228. {
  229. FILE *f;
  230. STARPU_ASSERT(was_benchmarked);
  231. char path[256];
  232. get_affinity_path(path, 256);
  233. f = fopen(path, "w+");
  234. if (!f)
  235. {
  236. perror("fopen");
  237. STARPU_ABORT();
  238. }
  239. #ifdef USE_CUDA
  240. struct machine_config_s *config = _starpu_get_machine_config();
  241. unsigned ncores = _starpu_topology_get_nhwcore(config);
  242. int gpu;
  243. for (gpu = 0; gpu < ncuda; gpu++)
  244. {
  245. fprintf(f, "%d\t", gpu);
  246. unsigned core;
  247. for (core = 0; core < ncores; core++)
  248. {
  249. fprintf(f, "%d\t", cudadev_timing_per_cpu[gpu+1][core].cpu_id);
  250. }
  251. fprintf(f, "\n");
  252. }
  253. #endif
  254. fclose(f);
  255. }
  256. static void generate_bus_affinity_file(void)
  257. {
  258. if (!was_benchmarked)
  259. benchmark_all_cuda_devices();
  260. write_bus_affinity_file_content();
  261. }
  262. static void load_bus_affinity_file(void)
  263. {
  264. int res;
  265. char path[256];
  266. get_affinity_path(path, 256);
  267. res = access(path, F_OK);
  268. if (res)
  269. {
  270. /* File does not exist yet */
  271. generate_bus_affinity_file();
  272. }
  273. load_bus_affinity_file_content();
  274. }
  275. int *get_gpu_affinity_vector(unsigned gpuid)
  276. {
  277. return affinity_matrix[gpuid];
  278. }
  279. /*
  280. * Latency
  281. */
  282. static void get_latency_path(char *path, size_t maxlen)
  283. {
  284. get_bus_path("latency", path, maxlen);
  285. }
  286. static void load_bus_latency_file_content(void)
  287. {
  288. int n;
  289. unsigned src, dst;
  290. FILE *f;
  291. char path[256];
  292. get_latency_path(path, 256);
  293. f = fopen(path, "r");
  294. STARPU_ASSERT(f);
  295. for (src = 0; src < STARPU_MAXNODES; src++)
  296. {
  297. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  298. {
  299. double latency;
  300. n = fscanf(f, "%lf ", &latency);
  301. STARPU_ASSERT(n == 1);
  302. latency_matrix[src][dst] = latency;
  303. }
  304. n = fscanf(f, "\n");
  305. STARPU_ASSERT(n == 0);
  306. }
  307. fclose(f);
  308. }
  309. static void write_bus_latency_file_content(void)
  310. {
  311. int src, dst;
  312. FILE *f;
  313. STARPU_ASSERT(was_benchmarked);
  314. char path[256];
  315. get_latency_path(path, 256);
  316. f = fopen(path, "w+");
  317. if (!f)
  318. {
  319. perror("fopen");
  320. STARPU_ABORT();
  321. }
  322. for (src = 0; src < STARPU_MAXNODES; src++)
  323. {
  324. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  325. {
  326. double latency;
  327. if ((src > ncuda) || (dst > ncuda))
  328. {
  329. /* convention */
  330. latency = -1.0;
  331. }
  332. else if (src == dst)
  333. {
  334. latency = 0.0;
  335. }
  336. else {
  337. latency = ((src && dst)?2000.0:500.0);
  338. }
  339. fprintf(f, "%lf ", latency);
  340. }
  341. fprintf(f, "\n");
  342. }
  343. fclose(f);
  344. }
  345. static void generate_bus_latency_file(void)
  346. {
  347. if (!was_benchmarked)
  348. benchmark_all_cuda_devices();
  349. write_bus_latency_file_content();
  350. }
  351. static void load_bus_latency_file(void)
  352. {
  353. int res;
  354. char path[256];
  355. get_latency_path(path, 256);
  356. res = access(path, F_OK);
  357. if (res)
  358. {
  359. /* File does not exist yet */
  360. generate_bus_latency_file();
  361. }
  362. load_bus_latency_file_content();
  363. }
  364. /*
  365. * Bandwith
  366. */
  367. static void get_bandwith_path(char *path, size_t maxlen)
  368. {
  369. get_bus_path("bandwith", path, maxlen);
  370. }
  371. static void load_bus_bandwith_file_content(void)
  372. {
  373. int n;
  374. unsigned src, dst;
  375. FILE *f;
  376. char path[256];
  377. get_bandwith_path(path, 256);
  378. f = fopen(path, "r");
  379. if (!f)
  380. {
  381. perror("fopen");
  382. STARPU_ABORT();
  383. }
  384. for (src = 0; src < STARPU_MAXNODES; src++)
  385. {
  386. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  387. {
  388. double bandwith;
  389. n = fscanf(f, "%lf ", &bandwith);
  390. STARPU_ASSERT(n == 1);
  391. bandwith_matrix[src][dst] = bandwith;
  392. }
  393. n = fscanf(f, "\n");
  394. STARPU_ASSERT(n == 0);
  395. }
  396. fclose(f);
  397. }
  398. static void write_bus_bandwith_file_content(void)
  399. {
  400. int src, dst;
  401. FILE *f;
  402. STARPU_ASSERT(was_benchmarked);
  403. char path[256];
  404. get_bandwith_path(path, 256);
  405. f = fopen(path, "w+");
  406. STARPU_ASSERT(f);
  407. for (src = 0; src < STARPU_MAXNODES; src++)
  408. {
  409. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  410. {
  411. double bandwith;
  412. if ((src > ncuda) || (dst > ncuda))
  413. {
  414. bandwith = -1.0;
  415. }
  416. #ifdef USE_CUDA
  417. else if (src != dst)
  418. {
  419. /* Bandwith = (SIZE)/(time i -> ram + time ram -> j)*/
  420. double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
  421. double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
  422. double timing =time_src_to_ram + time_ram_to_dst;
  423. bandwith = 1.0*SIZE/timing;
  424. }
  425. #endif
  426. else {
  427. /* convention */
  428. bandwith = 0.0;
  429. }
  430. fprintf(f, "%lf ", bandwith);
  431. }
  432. fprintf(f, "\n");
  433. }
  434. fclose(f);
  435. }
  436. static void generate_bus_bandwith_file(void)
  437. {
  438. if (!was_benchmarked)
  439. benchmark_all_cuda_devices();
  440. write_bus_bandwith_file_content();
  441. }
  442. static void load_bus_bandwith_file(void)
  443. {
  444. int res;
  445. char path[256];
  446. get_bandwith_path(path, 256);
  447. res = access(path, F_OK);
  448. if (res)
  449. {
  450. /* File does not exist yet */
  451. generate_bus_bandwith_file();
  452. }
  453. load_bus_bandwith_file_content();
  454. }
  455. /*
  456. * Generic
  457. */
  458. void starpu_force_bus_sampling(void)
  459. {
  460. create_sampling_directory_if_needed();
  461. generate_bus_affinity_file();
  462. generate_bus_latency_file();
  463. generate_bus_bandwith_file();
  464. }
  465. void load_bus_performance_files(void)
  466. {
  467. create_sampling_directory_if_needed();
  468. load_bus_affinity_file();
  469. load_bus_latency_file();
  470. load_bus_bandwith_file();
  471. }
  472. double predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
  473. {
  474. double bandwith = bandwith_matrix[src_node][dst_node];
  475. double latency = latency_matrix[src_node][dst_node];
  476. return latency + size/bandwith;
  477. }