cuda-latency.c 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. /*
  2. * StarPU
  3. * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include <pthread.h>
  17. #include <stdio.h>
  18. #include <cuda.h>
  19. #include <cuda_runtime.h>
  20. #include <assert.h>
  21. #include <string.h>
  22. #include <math.h>
  23. #include <sys/types.h>
  24. #include <sys/time.h>
  25. #include <pthread.h>
  26. #include <signal.h>
  27. static pthread_t thread[2];
  28. static unsigned thread_is_initialized[2];
  29. static pthread_cond_t cond;
  30. static pthread_mutex_t mutex;
  31. static size_t buffer_size = 4;
  32. static void *cpu_buffer;
  33. static void *gpu_buffer[2];
  34. static pthread_cond_t cond_go;
  35. static unsigned ready = 0;
  36. static unsigned nready_gpu = 0;
  37. static unsigned niter = 250000;
  38. static pthread_cond_t cond_gpu;
  39. static pthread_mutex_t mutex_gpu;
  40. static unsigned data_is_available[2];
  41. static cudaStream_t stream[2];
  42. #define ASYNC 1
  43. #define DO_TRANSFER_GPU_TO_RAM 1
  44. #define DO_TRANSFER_RAM_TO_GPU 1
  45. void send_data(unsigned src, unsigned dst)
  46. {
  47. cudaError_t cures;
  48. /* Copy data from GPU to RAM */
  49. #ifdef DO_TRANSFER_GPU_TO_RAM
  50. #ifdef ASYNC
  51. cures = cudaMemcpyAsync(cpu_buffer, gpu_buffer[src], buffer_size, cudaMemcpyDeviceToHost, stream[src]);
  52. assert(!cures);
  53. cures = cudaStreamSynchronize(stream[src]);
  54. assert(!cures);
  55. #else
  56. cures = cudaMemcpy(cpu_buffer, gpu_buffer[src], buffer_size, cudaMemcpyDeviceToHost);
  57. assert(!cures);
  58. cures = cudaThreadSynchronize();
  59. assert(!cures);
  60. #endif
  61. #endif
  62. /* Tell the other GPU that data is in RAM */
  63. pthread_mutex_lock(&mutex_gpu);
  64. data_is_available[src] = 0;
  65. data_is_available[dst] = 1;
  66. pthread_cond_signal(&cond_gpu);
  67. pthread_mutex_unlock(&mutex_gpu);
  68. //fprintf(stderr, "SEND on %d\n", src);
  69. }
  70. void recv_data(unsigned src, unsigned dst)
  71. {
  72. cudaError_t cures;
  73. /* Wait for the data to be in RAM */
  74. pthread_mutex_lock(&mutex_gpu);
  75. while (!data_is_available[dst])
  76. {
  77. pthread_cond_wait(&cond_gpu, &mutex_gpu);
  78. }
  79. pthread_mutex_unlock(&mutex_gpu);
  80. //fprintf(stderr, "RECV on %d\n", dst);
  81. /* Upload data */
  82. #ifdef DO_TRANSFER_RAM_TO_GPU
  83. #ifdef ASYNC
  84. cures = cudaMemcpyAsync(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyHostToDevice, stream[dst]);
  85. assert(!cures);
  86. cures = cudaThreadSynchronize();
  87. assert(!cures);
  88. #else
  89. cures = cudaMemcpy(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyHostToDevice);
  90. assert(!cures);
  91. cures = cudaThreadSynchronize();
  92. assert(!cures);
  93. #endif
  94. #endif
  95. }
  96. void *launch_gpu_thread(void *arg)
  97. {
  98. unsigned *idptr = arg;
  99. unsigned id = *idptr;
  100. cudaSetDevice(id);
  101. cudaFree(0);
  102. cudaMalloc(&gpu_buffer[id], buffer_size);
  103. cudaStreamCreate(&stream[id]);
  104. pthread_mutex_lock(&mutex);
  105. thread_is_initialized[id] = 1;
  106. pthread_cond_signal(&cond);
  107. nready_gpu++;
  108. while (!ready)
  109. pthread_cond_wait(&cond_go, &mutex);
  110. pthread_mutex_unlock(&mutex);
  111. unsigned iter;
  112. for (iter = 0; iter < niter; iter++)
  113. {
  114. if (id == 0) {
  115. send_data(0, 1);
  116. recv_data(1, 0);
  117. }
  118. else {
  119. recv_data(0, 1);
  120. send_data(1, 0);
  121. }
  122. }
  123. pthread_mutex_lock(&mutex);
  124. nready_gpu--;
  125. pthread_cond_signal(&cond_go);
  126. pthread_mutex_unlock(&mutex);
  127. return NULL;
  128. }
  129. int main(int argc, char **argv)
  130. {
  131. pthread_mutex_init(&mutex, NULL);
  132. pthread_cond_init(&cond, NULL);
  133. pthread_cond_init(&cond_go, NULL);
  134. cudaError_t cures;
  135. cures = cudaHostAlloc(&cpu_buffer, buffer_size, cudaHostAllocPortable);
  136. assert(!cures);
  137. unsigned id;
  138. for (id = 0; id < 2; id++)
  139. {
  140. thread_is_initialized[id] = 0;
  141. pthread_create(&thread[0], NULL, launch_gpu_thread, &id);
  142. pthread_mutex_lock(&mutex);
  143. while (!thread_is_initialized[id])
  144. {
  145. pthread_cond_wait(&cond, &mutex);
  146. }
  147. pthread_mutex_unlock(&mutex);
  148. }
  149. struct timeval start;
  150. struct timeval end;
  151. /* Start the ping pong */
  152. gettimeofday(&start, NULL);
  153. pthread_mutex_lock(&mutex);
  154. ready = 1;
  155. pthread_cond_broadcast(&cond_go);
  156. pthread_mutex_unlock(&mutex);
  157. /* Wait for the end of the ping pong */
  158. pthread_mutex_lock(&mutex);
  159. while (nready_gpu > 0)
  160. {
  161. pthread_cond_wait(&cond_go, &mutex);
  162. }
  163. pthread_mutex_unlock(&mutex);
  164. gettimeofday(&end, NULL);
  165. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
  166. (end.tv_usec - start.tv_usec));
  167. fprintf(stderr, "Took %.0f ms for %d iterations\n", timing/1000, niter);
  168. fprintf(stderr, "Latency: %.2f us\n", timing/(2*niter));
  169. return 0;
  170. }