cuda_latency.c 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010, 2011 Centre National de la Recherche Scientifique
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  #include <pthread.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <cuda.h>
  #include <cuda_runtime.h>
  #include <assert.h>
  #include <sys/types.h>
  #include <sys/time.h>
  23. static pthread_t thread[2];
  24. static unsigned thread_is_initialized[2];
  25. static _starpu_pthread_cond_t cond;
  26. static _starpu_pthread_mutex_t mutex;
  27. static size_t buffer_size = 4;
  28. static void *cpu_buffer;
  29. static void *gpu_buffer[2];
  30. static pthread_cond_t cond_go;
  31. static unsigned ready = 0;
  32. static unsigned nready_gpu = 0;
  33. static unsigned niter = 250000;
  34. static _starpu_pthread_cond_t cond_gpu;
  35. static _starpu_pthread_mutex_t mutex_gpu;
  36. static unsigned data_is_available[2];
  37. static cudaStream_t stream[2];
  38. #define ASYNC 1
  39. #define DO_TRANSFER_GPU_TO_RAM 1
  40. #define DO_TRANSFER_RAM_TO_GPU 1
  41. void send_data(unsigned src, unsigned dst)
  42. {
  43. cudaError_t cures;
  44. /* Copy data from GPU to RAM */
  45. #ifdef DO_TRANSFER_GPU_TO_RAM
  46. #ifdef ASYNC
  47. cures = cudaMemcpyAsync(cpu_buffer, gpu_buffer[src], buffer_size, cudaMemcpyDeviceToHost, stream[src]);
  48. STARPU_ASSERT(!cures);
  49. cures = cudaStreamSynchronize(stream[src]);
  50. STARPU_ASSERT(!cures);
  51. #else
  52. cures = cudaMemcpy(cpu_buffer, gpu_buffer[src], buffer_size, cudaMemcpyDeviceToHost);
  53. STARPU_ASSERT(!cures);
  54. cures = cudaThreadSynchronize();
  55. STARPU_ASSERT(!cures);
  56. #endif
  57. #endif
  58. /* Tell the other GPU that data is in RAM */
  59. _STARPU_PTHREAD_MUTEX_LOCK(&mutex_gpu);
  60. data_is_available[src] = 0;
  61. data_is_available[dst] = 1;
  62. _STARPU_PTHREAD_COND_SIGNAL(&cond_gpu);
  63. _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_gpu);
  64. //fprintf(stderr, "SEND on %d\n", src);
  65. }
  66. void recv_data(unsigned src, unsigned dst)
  67. {
  68. cudaError_t cures;
  69. /* Wait for the data to be in RAM */
  70. _STARPU_PTHREAD_MUTEX_LOCK(&mutex_gpu);
  71. while (!data_is_available[dst])
  72. {
  73. _STARPU_PTHREAD_COND_WAIT(&cond_gpu, &mutex_gpu);
  74. }
  75. _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_gpu);
  76. //fprintf(stderr, "RECV on %d\n", dst);
  77. /* Upload data */
  78. #ifdef DO_TRANSFER_RAM_TO_GPU
  79. #ifdef ASYNC
  80. cures = cudaMemcpyAsync(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyHostToDevice, stream[dst]);
  81. STARPU_ASSERT(!cures);
  82. cures = cudaStreamSynchronize(stream[dst]);
  83. STARPU_ASSERT(!cures);
  84. #else
  85. cures = cudaMemcpy(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyHostToDevice);
  86. STARPU_ASSERT(!cures);
  87. cures = cudaThreadSynchronize();
  88. STARPU_ASSERT(!cures);
  89. #endif
  90. #endif
  91. }
  92. void *launch_gpu_thread(void *arg)
  93. {
  94. unsigned *idptr = arg;
  95. unsigned id = *idptr;
  96. starpu_cuda_set_device(id);
  97. cudaFree(0);
  98. cudaMalloc(&gpu_buffer[id], buffer_size);
  99. cudaStreamCreate(&stream[id]);
  100. _STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  101. thread_is_initialized[id] = 1;
  102. _STARPU_PTHREAD_COND_SIGNAL(&cond);
  103. if (id == 0)
  104. {
  105. cudaError_t cures;
  106. cures = cudaHostAlloc(&cpu_buffer, buffer_size, cudaHostAllocPortable);
  107. STARPU_ASSERT(!cures);
  108. cudaThreadSynchronize();
  109. }
  110. nready_gpu++;
  111. while (!ready)
  112. _STARPU_PTHREAD_COND_WAIT(&cond_go, &mutex);
  113. _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  114. unsigned iter;
  115. for (iter = 0; iter < niter; iter++)
  116. {
  117. if (id == 0)
  118. {
  119. send_data(0, 1);
  120. recv_data(1, 0);
  121. }
  122. else
  123. {
  124. recv_data(0, 1);
  125. send_data(1, 0);
  126. }
  127. }
  128. _STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  129. nready_gpu--;
  130. _STARPU_PTHREAD_COND_SIGNAL(&cond_go);
  131. _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  132. return NULL;
  133. }
  134. int main(int argc, char **argv)
  135. {
  136. _STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
  137. _STARPU_PTHREAD_COND_INIT(&cond, NULL);
  138. _STARPU_PTHREAD_COND_INIT(&cond_go, NULL);
  139. unsigned id;
  140. for (id = 0; id < 2; id++)
  141. {
  142. thread_is_initialized[id] = 0;
  143. pthread_create(&thread[0], NULL, launch_gpu_thread, &id);
  144. _STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  145. while (!thread_is_initialized[id])
  146. {
  147. _STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
  148. }
  149. _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  150. }
  151. struct timeval start;
  152. struct timeval end;
  153. /* Start the ping pong */
  154. gettimeofday(&start, NULL);
  155. _STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  156. ready = 1;
  157. _STARPU_PTHREAD_COND_BROADCAST(&cond_go);
  158. _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  159. /* Wait for the end of the ping pong */
  160. _STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  161. while (nready_gpu > 0)
  162. {
  163. _STARPU_PTHREAD_COND_WAIT(&cond_go, &mutex);
  164. }
  165. _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  166. gettimeofday(&end, NULL);
  167. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
  168. (end.tv_usec - start.tv_usec));
  169. fprintf(stderr, "Took %.0f ms for %d iterations\n", timing/1000, niter);
  170. fprintf(stderr, "Latency: %.2f us\n", timing/(2*niter));
  171. return EXIT_SUCCESS;
  172. }