cuda-bandwith.c 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. /*
  2. * StarPU
  3. * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #define _GNU_SOURCE
  17. #include <sched.h>
  18. #include <cuda.h>
  19. #include <cublas.h>
  20. #include <cblas.h>
  21. #include <string.h>
  22. #include <stdio.h>
  23. #include <stdlib.h>
  24. #include <stdint.h>
  25. #include <pthread.h>
  26. #include <assert.h>
  27. #include <sys/time.h>
  /* Benchmark parameters; parse_args() may override them from the command line. */
  int GPU_LD = 2048;      /* leading dimension of the device matrix (-gpu-ld) */
  int CPU_LD = 2048;      /* leading dimension of the host matrix (-cpu-ld) */
  int MATRIXSIZE = 1024;  /* rows == columns of the transferred sub-matrix (-size) */
  int pinned = 0;         /* non-zero: host buffer is page-locked (-pin) */
  int htod = 0;           /* non-zero: copy host-to-device, else device-to-host (-HtoD) */
  int ITER = 100;         /* number of timed transfers (-iter) */

  /*
   * Size in bytes of an LD x LD matrix of floats. The product is evaluated in
   * size_t so that large leading dimensions do not overflow int arithmetic.
   */
  #define CPUBUFFERSIZE (sizeof(float)*(size_t)CPU_LD*(size_t)CPU_LD)
  #define GPUBUFFERSIZE (sizeof(float)*(size_t)GPU_LD*(size_t)GPU_LD)

  float *h_A;  /* host-side transfer buffer */
  void * d_A;  /* device-side transfer buffer */

  /* Not referenced by the code visible in this file; kept for compatibility. */
  float *A, *B, *C;
  unsigned cuda_initialized = 0;
  40. static void parse_args(int argc, char **argv)
  41. {
  42. int i;
  43. for (i = 1; i < argc; i++) {
  44. if (strcmp(argv[i], "-gpu-ld") == 0) {
  45. char *argptr;
  46. GPU_LD = strtol(argv[++i], &argptr, 10);
  47. }
  48. if (strcmp(argv[i], "-cpu-ld") == 0) {
  49. char *argptr;
  50. CPU_LD = strtol(argv[++i], &argptr, 10);
  51. }
  52. if (strcmp(argv[i], "-size") == 0) {
  53. char *argptr;
  54. MATRIXSIZE = strtol(argv[++i], &argptr, 10);
  55. }
  56. if (strcmp(argv[i], "-pin") == 0) {
  57. pinned = 1;
  58. }
  59. if (strcmp(argv[i], "-HtoD") == 0) {
  60. htod = 1;
  61. }
  62. if (strcmp(argv[i], "-iter") == 0) {
  63. char *argptr;
  64. ITER = strtol(argv[++i], &argptr, 10);
  65. }
  66. if (strcmp(argv[i], "-h") == 0) {
  67. printf("usage : %s [-pin] [-HtoD] [-size size] [-cpu-ld ld] [-gpu-ld ld] [-iter n]\n", argv[0]);
  68. }
  69. }
  70. assert(CPU_LD >= MATRIXSIZE);
  71. assert(GPU_LD >= MATRIXSIZE);
  72. }
  /*
   * Restricts the calling thread to the single CPU `cpu`.
   *
   * Binding is best-effort: an out-of-range index or a failing
   * sched_setaffinity() call is reported on stderr and the function returns
   * with the affinity left as it was.
   */
  void bind_thread(int cpu)
  {
      cpu_set_t mask;

      /* CPU_SET() is only meaningful for indices inside the fixed-size set. */
      if (cpu < 0 || cpu >= CPU_SETSIZE) {
          fprintf(stderr, "bind_thread: invalid cpu index %d\n", cpu);
          return;
      }

      CPU_ZERO(&mask);
      CPU_SET(cpu, &mask);

      /* A pid of 0 designates the calling thread. */
      if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
          perror("sched_setaffinity");
  }
  81. void benchmark_memcpy(void)
  82. {
  83. unsigned count;
  84. struct timeval tv_start, tv_end;
  85. unsigned long long usecs;
  86. double bytes = 4.0*MATRIXSIZE*MATRIXSIZE*ITER;
  87. cublasInit();
  88. if (pinned)
  89. {
  90. cuMemAllocHost((void **)&h_A, CPUBUFFERSIZE);
  91. }
  92. else
  93. {
  94. h_A = malloc(CPUBUFFERSIZE);
  95. }
  96. assert(h_A);
  97. /* malloc a buffer on the device */
  98. cublasAlloc(GPU_LD*GPU_LD, sizeof(float), &d_A);
  99. assert(d_A);
  100. gettimeofday(&tv_start, NULL);
  101. if (!pinned)
  102. {
  103. /* pageable memory */
  104. if (!htod)
  105. {
  106. for (count = 0; count < ITER; count++)
  107. {
  108. cublasGetMatrix(MATRIXSIZE, MATRIXSIZE, sizeof(float),
  109. (void *)d_A, GPU_LD, h_A, CPU_LD);
  110. cuCtxSynchronize();
  111. }
  112. }
  113. else
  114. {
  115. for (count = 0; count < ITER; count++)
  116. {
  117. cublasSetMatrix(MATRIXSIZE, MATRIXSIZE, sizeof(float),
  118. h_A, CPU_LD, (void *)d_A, GPU_LD);
  119. cuCtxSynchronize();
  120. }
  121. }
  122. }
  123. else
  124. {
  125. if (!htod)
  126. {
  127. for (count = 0; count < ITER; count++)
  128. {
  129. cublasGetMatrix(MATRIXSIZE, MATRIXSIZE, sizeof(float),
  130. d_A, GPU_LD, h_A, CPU_LD);
  131. cuCtxSynchronize();
  132. }
  133. }
  134. else {
  135. for (count = 0; count < ITER; count++)
  136. {
  137. cublasSetMatrix(MATRIXSIZE, MATRIXSIZE, sizeof(float),
  138. h_A, CPU_LD, d_A, GPU_LD);
  139. cuCtxSynchronize();
  140. }
  141. }
  142. }
  143. gettimeofday(&tv_end, NULL);
  144. usecs = (tv_end.tv_usec - tv_start.tv_usec) + 1000000*(tv_end.tv_sec - tv_start.tv_sec);
  145. printf("%2.2f\n", bytes/usecs);
  146. if (pinned)
  147. {
  148. cuMemFreeHost(&h_A);
  149. }
  150. else
  151. {
  152. free(h_A);
  153. }
  154. }
  /*
   * Program entry point: reads the benchmark parameters from the command
   * line, binds the calling thread to CPU 0 and runs the transfer benchmark.
   * Always returns 0.
   */
  int main(int argc, char **argv)
  {
      const int benchmark_cpu = 0;

      parse_args(argc, argv);

      /* Bind to a fixed CPU before measuring. */
      bind_thread(benchmark_cpu);

      benchmark_memcpy();

      return EXIT_SUCCESS;
  }