perfmodel_bus.c 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. /*
  2. * StarPU
  3. * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include <unistd.h>
  17. #include <sys/time.h>
  18. #include <starpu.h>
  19. #include <common/config.h>
  20. #include <core/perfmodel/perfmodel.h>
  21. #include <datawizard/data_parameters.h>
  22. #define SIZE (32*1024*1024*sizeof(char))
  23. #define NITER 128
  24. static double bandwith_matrix[MAXNODES][MAXNODES] = {{-1.0}};
  25. static double latency_matrix[MAXNODES][MAXNODES] = {{ -1.0}};
  26. /* Benchmarking the performance of the bus */
  27. static double cudadev_timing_htod[MAXNODES] = {0.0};
  28. static double cudadev_timing_dtoh[MAXNODES] = {0.0};
  29. static int ncuda;
  30. static unsigned was_benchmarked = 0;
  31. static void measure_bandwith_between_host_and_dev(int dev)
  32. {
  33. /* Initiliaze CUDA context on the device */
  34. cudaSetDevice(dev);
  35. /* hack to force the initialization */
  36. cudaFree(0);
  37. /* Allocate a buffer on the device */
  38. unsigned char *d_buffer;
  39. cudaMalloc((void **)&d_buffer, SIZE);
  40. assert(d_buffer);
  41. /* Allocate a buffer on the host */
  42. unsigned char *h_buffer;
  43. cudaHostAlloc((void **)&h_buffer, SIZE, 0);
  44. assert(h_buffer);
  45. /* Fill them */
  46. memset(h_buffer, 0, SIZE);
  47. cudaMemset(d_buffer, 0, SIZE);
  48. unsigned iter;
  49. double timing;
  50. struct timeval start;
  51. struct timeval end;
  52. /* Measure upload bandwith */
  53. gettimeofday(&start, NULL);
  54. for (iter = 0; iter < NITER; iter++)
  55. {
  56. cudaMemcpy(d_buffer, h_buffer, SIZE, cudaMemcpyHostToDevice);
  57. cudaThreadSynchronize();
  58. }
  59. gettimeofday(&end, NULL);
  60. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  61. cudadev_timing_htod[dev+1] = timing/NITER;
  62. /* Measure download bandwith */
  63. gettimeofday(&start, NULL);
  64. for (iter = 0; iter < NITER; iter++)
  65. {
  66. cudaMemcpy(h_buffer, d_buffer, SIZE, cudaMemcpyDeviceToHost);
  67. cudaThreadSynchronize();
  68. }
  69. gettimeofday(&end, NULL);
  70. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  71. cudadev_timing_dtoh[dev+1] = timing/NITER;
  72. /* Free buffers */
  73. cudaFreeHost(h_buffer);
  74. cudaFree(d_buffer);
  75. cudaThreadExit();
  76. }
  77. static void benchmark_all_cuda_devices(void)
  78. {
  79. #ifdef VERBOSE
  80. fprintf(stderr, "Benchmarking the speed of the bus\n");
  81. #endif
  82. cudaGetDeviceCount(&ncuda);
  83. int i;
  84. for (i = 0; i < ncuda; i++)
  85. {
  86. /* measure bandwith between Host and Device i */
  87. measure_bandwith_between_host_and_dev(i);
  88. }
  89. was_benchmarked = 1;
  90. #ifdef VERBOSE
  91. fprintf(stderr, "Benchmarking the speed of the bus is done.\n");
  92. #endif
  93. }
  94. static void get_bus_path(const char *type, char *path, size_t maxlen)
  95. {
  96. strncpy(path, PERF_MODEL_DIR_BUS, maxlen);
  97. strncat(path, type, maxlen);
  98. char hostname[32];
  99. gethostname(hostname, 32);
  100. strncat(path, ".", maxlen);
  101. strncat(path, hostname, maxlen);
  102. }
  103. /*
  104. * Latency
  105. */
  /*
   * Store in `path` (a buffer of `maxlen` bytes) the name of the bus latency
   * file, as built by get_bus_path() for the "latency" kind.
   */
  static void get_latency_path(char *path, size_t maxlen)
  {
      const char *kind = "latency";

      get_bus_path(kind, path, maxlen);
  }
  110. static void load_bus_latency_file_content(void)
  111. {
  112. int n;
  113. unsigned src, dst;
  114. FILE *f;
  115. char path[256];
  116. get_latency_path(path, 256);
  117. f = fopen(path, "r");
  118. STARPU_ASSERT(f);
  119. for (src = 0; src < MAXNODES; src++)
  120. {
  121. for (dst = 0; dst < MAXNODES; dst++)
  122. {
  123. double latency;
  124. n = fscanf(f, "%lf ", &latency);
  125. STARPU_ASSERT(n == 1);
  126. latency_matrix[src][dst] = latency;
  127. }
  128. n = fscanf(f, "\n");
  129. STARPU_ASSERT(n == 0);
  130. }
  131. fclose(f);
  132. }
  133. static void write_bus_latency_file_content(void)
  134. {
  135. int src, dst;
  136. FILE *f;
  137. STARPU_ASSERT(was_benchmarked);
  138. char path[256];
  139. get_latency_path(path, 256);
  140. f = fopen(path, "w+");
  141. STARPU_ASSERT(f);
  142. for (src = 0; src < MAXNODES; src++)
  143. {
  144. for (dst = 0; dst < MAXNODES; dst++)
  145. {
  146. double latency;
  147. if ((src > ncuda) || (dst > ncuda))
  148. {
  149. /* convention */
  150. latency = -1.0;
  151. }
  152. else if (src == dst)
  153. {
  154. latency = 0.0;
  155. }
  156. else {
  157. latency = ((src && dst)?2000.0:500.0);
  158. }
  159. fprintf(f, "%lf ", latency);
  160. }
  161. fprintf(f, "\n");
  162. }
  163. fclose(f);
  164. }
  165. static void load_bus_latency_file(void)
  166. {
  167. int res;
  168. char path[256];
  169. get_latency_path(path, 256);
  170. res = access(path, F_OK);
  171. if (res)
  172. {
  173. /* File does not exist yet */
  174. if (!was_benchmarked)
  175. benchmark_all_cuda_devices();
  176. write_bus_latency_file_content();
  177. }
  178. load_bus_latency_file_content();
  179. }
  180. /*
  181. * Bandwith
  182. */
  /*
   * Store in `path` (a buffer of `maxlen` bytes) the name of the bus
   * bandwidth file, as built by get_bus_path() for the "bandwith" kind.
   */
  static void get_bandwith_path(char *path, size_t maxlen)
  {
      const char *kind = "bandwith";

      get_bus_path(kind, path, maxlen);
  }
  187. static void load_bus_bandwith_file_content(void)
  188. {
  189. int n;
  190. unsigned src, dst;
  191. FILE *f;
  192. char path[256];
  193. get_bandwith_path(path, 256);
  194. f = fopen(path, "r");
  195. STARPU_ASSERT(f);
  196. for (src = 0; src < MAXNODES; src++)
  197. {
  198. for (dst = 0; dst < MAXNODES; dst++)
  199. {
  200. double bandwith;
  201. n = fscanf(f, "%lf ", &bandwith);
  202. STARPU_ASSERT(n == 1);
  203. bandwith_matrix[src][dst] = bandwith;
  204. }
  205. n = fscanf(f, "\n");
  206. STARPU_ASSERT(n == 0);
  207. }
  208. fclose(f);
  209. }
  210. static void write_bus_bandwith_file_content(void)
  211. {
  212. int src, dst;
  213. FILE *f;
  214. STARPU_ASSERT(was_benchmarked);
  215. char path[256];
  216. get_bandwith_path(path, 256);
  217. f = fopen(path, "w+");
  218. STARPU_ASSERT(f);
  219. for (src = 0; src < MAXNODES; src++)
  220. {
  221. for (dst = 0; dst < MAXNODES; dst++)
  222. {
  223. double bandwith;
  224. if ((src > ncuda) || (dst > ncuda))
  225. {
  226. bandwith = -1.0;
  227. }
  228. else if (src != dst)
  229. {
  230. /* Bandwith = (SIZE)/(time i -> ram + time ram -> j)*/
  231. double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
  232. double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
  233. double timing =time_src_to_ram + time_ram_to_dst;
  234. bandwith = 1.0*SIZE/timing;
  235. }
  236. else {
  237. /* convention */
  238. bandwith = 0.0;
  239. }
  240. fprintf(f, "%lf ", bandwith);
  241. }
  242. fprintf(f, "\n");
  243. }
  244. fclose(f);
  245. }
  246. static void load_bus_bandwith_file(void)
  247. {
  248. int res;
  249. char path[256];
  250. get_bandwith_path(path, 256);
  251. res = access(path, F_OK);
  252. if (res)
  253. {
  254. /* File does not exist yet */
  255. if (!was_benchmarked)
  256. benchmark_all_cuda_devices();
  257. write_bus_bandwith_file_content();
  258. }
  259. load_bus_bandwith_file_content();
  260. }
  261. /*
  262. * Generic
  263. */
  /*
   * Load both bus performance matrices (latency, then bandwidth), creating
   * the backing files on first use. The benchmark runs at most once because
   * each loader checks was_benchmarked before triggering it.
   */
  void load_bus_performance_files(void)
  {
      /* Fills latency_matrix */
      load_bus_latency_file();

      /* Fills bandwith_matrix */
      load_bus_bandwith_file();
  }
  269. double predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
  270. {
  271. double bandwith = bandwith_matrix[src_node][dst_node];
  272. double latency = latency_matrix[src_node][dst_node];
  273. return latency + size/bandwith;
  274. }