cuda_latency.c 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010,2011,2013,2015-2017 CNRS
  4. * Copyright (C) 2010-2012,2014 Université de Bordeaux
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <stdio.h>
  18. #include <cuda.h>
  19. #include <cuda_runtime.h>
  20. #include <assert.h>
  21. #include <sys/types.h>
  22. static starpu_pthread_t thread[2];
  23. static unsigned thread_is_initialized[2];
  24. static starpu_pthread_cond_t cond;
  25. static starpu_pthread_mutex_t mutex;
  26. static size_t buffer_size = 4;
  27. static void *cpu_buffer;
  28. static void *gpu_buffer[2];
  29. static starpu_pthread_cond_t cond_go;
  30. static unsigned ready = 0;
  31. static unsigned nready_gpu = 0;
  32. static unsigned niter = 250000;
  33. static starpu_pthread_cond_t cond_gpu;
  34. static starpu_pthread_mutex_t mutex_gpu;
  35. static unsigned data_is_available[2];
  36. static cudaStream_t stream[2];
  37. #define ASYNC 1
  38. #define DO_TRANSFER_GPU_TO_RAM 1
  39. #define DO_TRANSFER_RAM_TO_GPU 1
  40. void send_data(unsigned src, unsigned dst)
  41. {
  42. cudaError_t cures;
  43. /* Copy data from GPU to RAM */
  44. #ifdef DO_TRANSFER_GPU_TO_RAM
  45. #ifdef ASYNC
  46. cures = cudaMemcpyAsync(cpu_buffer, gpu_buffer[src], buffer_size, cudaMemcpyDeviceToHost, stream[src]);
  47. STARPU_ASSERT(!cures);
  48. cures = cudaStreamSynchronize(stream[src]);
  49. STARPU_ASSERT(!cures);
  50. #else
  51. cures = cudaMemcpy(cpu_buffer, gpu_buffer[src], buffer_size, cudaMemcpyDeviceToHost);
  52. STARPU_ASSERT(!cures);
  53. cures = cudaThreadSynchronize();
  54. STARPU_ASSERT(!cures);
  55. #endif
  56. #endif
  57. /* Tell the other GPU that data is in RAM */
  58. STARPU_PTHREAD_MUTEX_LOCK(&mutex_gpu);
  59. data_is_available[src] = 0;
  60. data_is_available[dst] = 1;
  61. STARPU_PTHREAD_COND_SIGNAL(&cond_gpu);
  62. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_gpu);
  63. //fprintf(stderr, "SEND on %d\n", src);
  64. }
  65. void recv_data(unsigned src, unsigned dst)
  66. {
  67. cudaError_t cures;
  68. /* Wait for the data to be in RAM */
  69. STARPU_PTHREAD_MUTEX_LOCK(&mutex_gpu);
  70. while (!data_is_available[dst])
  71. {
  72. STARPU_PTHREAD_COND_WAIT(&cond_gpu, &mutex_gpu);
  73. }
  74. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_gpu);
  75. //fprintf(stderr, "RECV on %d\n", dst);
  76. /* Upload data */
  77. #ifdef DO_TRANSFER_RAM_TO_GPU
  78. #ifdef ASYNC
  79. cures = cudaMemcpyAsync(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyHostToDevice, stream[dst]);
  80. STARPU_ASSERT(!cures);
  81. cures = cudaStreamSynchronize(stream[dst]);
  82. STARPU_ASSERT(!cures);
  83. #else
  84. cures = cudaMemcpy(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyHostToDevice);
  85. STARPU_ASSERT(!cures);
  86. cures = cudaThreadSynchronize();
  87. STARPU_ASSERT(!cures);
  88. #endif
  89. #endif
  90. }
  91. void *launch_gpu_thread(void *arg)
  92. {
  93. unsigned *idptr = arg;
  94. unsigned id = *idptr;
  95. starpu_cuda_set_device(id);
  96. cudaFree(0);
  97. cudaMalloc(&gpu_buffer[id], buffer_size);
  98. cudaStreamCreate(&stream[id]);
  99. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  100. thread_is_initialized[id] = 1;
  101. STARPU_PTHREAD_COND_SIGNAL(&cond);
  102. if (id == 0)
  103. {
  104. cudaError_t cures;
  105. cures = cudaHostAlloc(&cpu_buffer, buffer_size, cudaHostAllocPortable);
  106. STARPU_ASSERT(!cures);
  107. cudaThreadSynchronize();
  108. }
  109. nready_gpu++;
  110. while (!ready)
  111. STARPU_PTHREAD_COND_WAIT(&cond_go, &mutex);
  112. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  113. unsigned iter;
  114. for (iter = 0; iter < niter; iter++)
  115. {
  116. if (id == 0)
  117. {
  118. send_data(0, 1);
  119. recv_data(1, 0);
  120. }
  121. else
  122. {
  123. recv_data(0, 1);
  124. send_data(1, 0);
  125. }
  126. }
  127. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  128. nready_gpu--;
  129. STARPU_PTHREAD_COND_SIGNAL(&cond_go);
  130. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  131. return NULL;
  132. }
  133. int main(int argc, char **argv)
  134. {
  135. STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
  136. STARPU_PTHREAD_COND_INIT(&cond, NULL);
  137. STARPU_PTHREAD_COND_INIT(&cond_go, NULL);
  138. unsigned id;
  139. for (id = 0; id < 2; id++)
  140. {
  141. thread_is_initialized[id] = 0;
  142. STARPU_PTHREAD_CREATE(&thread[0], NULL, launch_gpu_thread, &id);
  143. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  144. while (!thread_is_initialized[id])
  145. {
  146. STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
  147. }
  148. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  149. }
  150. double start;
  151. double end;
  152. /* Start the ping pong */
  153. start = starpu_timing_now();
  154. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  155. ready = 1;
  156. STARPU_PTHREAD_COND_BROADCAST(&cond_go);
  157. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  158. /* Wait for the end of the ping pong */
  159. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  160. while (nready_gpu > 0)
  161. {
  162. STARPU_PTHREAD_COND_WAIT(&cond_go, &mutex);
  163. }
  164. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  165. end = starpu_timing_now();
  166. double timing = end - start;
  167. fprintf(stderr, "Took %.0f ms for %u iterations\n", timing/1000, niter);
  168. fprintf(stderr, "Latency: %.2f us\n", timing/(2*niter));
  169. return EXIT_SUCCESS;
  170. }