cuda_bandwidth.c 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #ifndef _GNU_SOURCE
  17. #define _GNU_SOURCE 1
  18. #endif
  #include <assert.h>
  #include <pthread.h>
  #include <sched.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <time.h>

  #include <cuda.h>
  #include <cublas.h>
  #include <cblas.h>
  /* Leading dimension, in elements, used for the device-side matrix
   * (overridden by -gpu-ld). */
  int GPU_LD = 2048;
  /* Leading dimension, in elements, used for the host-side matrix
   * (overridden by -cpu-ld). */
  int CPU_LD = 2048;
  /* Number of rows and columns of the square matrix that is transferred
   * (overridden by -size). */
  int MATRIXSIZE = 1024;
  /* Non-zero when the host buffer must be allocated with cuMemAllocHost
   * instead of malloc (set by -pin). */
  int pinned = 0;
  /* Non-zero to time host-to-device transfers, zero for device-to-host
   * (set by -HtoD). */
  int htod = 0;
  /* Number of transfers performed inside the timed region
   * (overridden by -iter). */
  int ITER = 100;

  /* Size in bytes of an LD x LD buffer of floats on each side.
   * The arithmetic is carried out in size_t: with plain int operands the
   * product overflowed as soon as the leading dimension reached 23171. */
  #define CPUBUFFERSIZE (sizeof(float)*(size_t)CPU_LD*(size_t)CPU_LD)
  #define GPUBUFFERSIZE (sizeof(float)*(size_t)GPU_LD*(size_t)GPU_LD)

  /* Host-side transfer buffer. */
  float *h_A;
  /* Device-side transfer buffer. */
  void * d_A;
  /* NOTE(review): A, B, C and cuda_initialized are not referenced by the code
   * visible in this file -- confirm before removing them. */
  float *A, *B, *C;
  unsigned cuda_initialized = 0;
  41. static void parse_args(int argc, char **argv)
  42. {
  43. int i;
  44. for (i = 1; i < argc; i++)
  45. {
  46. if (strcmp(argv[i], "-gpu-ld") == 0)
  47. {
  48. char *argptr;
  49. GPU_LD = strtol(argv[++i], &argptr, 10);
  50. }
  51. if (strcmp(argv[i], "-cpu-ld") == 0)
  52. {
  53. char *argptr;
  54. CPU_LD = strtol(argv[++i], &argptr, 10);
  55. }
  56. if (strcmp(argv[i], "-size") == 0)
  57. {
  58. char *argptr;
  59. MATRIXSIZE = strtol(argv[++i], &argptr, 10);
  60. }
  61. if (strcmp(argv[i], "-pin") == 0)
  62. {
  63. pinned = 1;
  64. }
  65. if (strcmp(argv[i], "-HtoD") == 0)
  66. {
  67. htod = 1;
  68. }
  69. if (strcmp(argv[i], "-iter") == 0)
  70. {
  71. char *argptr;
  72. ITER = strtol(argv[++i], &argptr, 10);
  73. }
  74. if (strcmp(argv[i], "-h") == 0)
  75. {
  76. printf("usage : %s [-pin] [-HtoD] [-size size] [-cpu-ld ld] [-gpu-ld ld] [-iter n]\n", argv[0]);
  77. }
  78. }
  79. STARPU_ASSERT(CPU_LD >= MATRIXSIZE);
  80. STARPU_ASSERT(GPU_LD >= MATRIXSIZE);
  81. }
  /* Restrict the calling thread to the single CPU `cpu`.
   *
   * Binding is best-effort: an out-of-range index or a failing
   * sched_setaffinity() call is reported on stderr and the function returns
   * without changing the affinity mask. */
  void bind_thread(int cpu)
  {
    cpu_set_t mask;

    if (cpu < 0 || cpu >= CPU_SETSIZE)
    {
      fprintf(stderr, "bind_thread: cpu %d is out of range\n", cpu);
      return;
    }

    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);

    /* pid 0 designates the calling thread */
    if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
    {
      perror("sched_setaffinity");
    }
  }
  90. void benchmark_memcpy(void)
  91. {
  92. unsigned count;
  93. double tv_start, tv_end;
  94. unsigned long long usecs;
  95. double bytes = 4.0*MATRIXSIZE*MATRIXSIZE*ITER;
  96. cublasInit();
  97. if (pinned)
  98. {
  99. cuMemAllocHost((void **)&h_A, CPUBUFFERSIZE);
  100. }
  101. else
  102. {
  103. h_A = malloc(CPUBUFFERSIZE);
  104. }
  105. STARPU_ASSERT(h_A);
  106. /* malloc a buffer on the device */
  107. cublasAlloc(GPU_LD*GPU_LD, sizeof(float), &d_A);
  108. STARPU_ASSERT(d_A);
  109. tv_start = starpu_timing_now();
  110. if (!pinned)
  111. {
  112. /* pageable memory */
  113. if (!htod)
  114. {
  115. for (count = 0; count < ITER; count++)
  116. {
  117. cublasGetMatrix(MATRIXSIZE, MATRIXSIZE, sizeof(float),
  118. (void *)d_A, GPU_LD, h_A, CPU_LD);
  119. cuCtxSynchronize();
  120. }
  121. }
  122. else
  123. {
  124. for (count = 0; count < ITER; count++)
  125. {
  126. cublasSetMatrix(MATRIXSIZE, MATRIXSIZE, sizeof(float),
  127. h_A, CPU_LD, (void *)d_A, GPU_LD);
  128. cuCtxSynchronize();
  129. }
  130. }
  131. }
  132. else
  133. {
  134. if (!htod)
  135. {
  136. for (count = 0; count < ITER; count++)
  137. {
  138. cublasGetMatrix(MATRIXSIZE, MATRIXSIZE, sizeof(float),
  139. d_A, GPU_LD, h_A, CPU_LD);
  140. cuCtxSynchronize();
  141. }
  142. }
  143. else
  144. {
  145. for (count = 0; count < ITER; count++)
  146. {
  147. cublasSetMatrix(MATRIXSIZE, MATRIXSIZE, sizeof(float),
  148. h_A, CPU_LD, d_A, GPU_LD);
  149. cuCtxSynchronize();
  150. }
  151. }
  152. }
  153. tv_end = starpu_timing_now();
  154. usecs = tv_end - tv_start;
  155. printf("%2.2f\n", bytes/usecs);
  156. if (pinned)
  157. {
  158. cuMemFreeHost(&h_A);
  159. }
  160. else
  161. {
  162. free(h_A);
  163. }
  164. }
  /* Program entry point: read the command-line options, bind the calling
   * thread to CPU 0 and run the transfer benchmark once. */
  int main(int argc, char **argv)
  {
    /* fill in the benchmark settings from the command line */
    parse_args(argc, argv);

    /* run the measurement from CPU 0 */
    bind_thread(0);

    benchmark_memcpy();

    return EXIT_SUCCESS;
  }