strassen2_kernels.c 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. /*
  2. * StarPU
  3. * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include <stdio.h>
  17. #include <stdint.h>
  18. #include <string.h>
  19. #include <math.h>
  20. #include <sys/types.h>
  21. #include <pthread.h>
  22. #include <signal.h>
  23. #include <semaphore.h>
  24. #include <starpu_config.h>
  25. #ifdef USE_CUDA
  26. #include <cublas.h>
  27. #endif
  28. #include "../common/blas.h"
  29. #include <starpu.h>
  30. static double cublas_flop = 0.0;
  31. static double cpus_flop = 0.0;
  32. void display_perf(double timing, unsigned size)
  33. {
  34. double total_flop_n3 = (2.0*size*size*size);
  35. double total_flop = cublas_flop + cpus_flop;
  36. fprintf(stderr, "Computation took (ms):\n");
  37. printf("%2.2f\n", timing/1000);
  38. fprintf(stderr, " GFlop : O(n3) -> %2.2f\n",
  39. (double)total_flop_n3/1000000000.0f);
  40. fprintf(stderr, " GFlop : real %2.2f\n",
  41. (double)total_flop/1000000000.0f);
  42. fprintf(stderr, " CPU : %2.2f (%2.2f%%)\n", (double)cpus_flop/1000000000.0, (100.0*cpus_flop)/(cpus_flop + cublas_flop));
  43. fprintf(stderr, " GPU : %2.2f (%2.2f%%)\n", (double)cublas_flop/1000000000.0, (100.0*cublas_flop)/(cpus_flop + cublas_flop));
  44. fprintf(stderr, " GFlop/s : %2.2f\n", (double)total_flop / (double)timing/1000);
  45. }
  46. static void mult_common_codelet(starpu_data_interface_t *buffers, int s, __attribute__((unused)) void *arg)
  47. {
  48. float *center = (float *)buffers[0].blas.ptr;
  49. float *left = (float *)buffers[1].blas.ptr;
  50. float *right = (float *)buffers[2].blas.ptr;
  51. unsigned n = buffers[0].blas.nx;
  52. unsigned ld21 = buffers[1].blas.ld;
  53. unsigned ld12 = buffers[2].blas.ld;
  54. unsigned ld22 = buffers[0].blas.ld;
  55. double flop = 2.0*n*n*n;
  56. switch (s) {
  57. case 0:
  58. cpus_flop += flop;
  59. SGEMM("N", "N", n, n, n, 1.0f, right, ld21, left, ld12, 0.0f, center, ld22);
  60. break;
  61. #ifdef USE_CUDA
  62. case 1:
  63. cublas_flop += flop;
  64. cublasSgemm('n', 'n', n, n, n, 1.0f, right, ld12, left, ld21, 0.0f, center, ld22);
  65. break;
  66. #endif
  67. default:
  68. STARPU_ASSERT(0);
  69. break;
  70. }
  71. }
  72. void mult_core_codelet(starpu_data_interface_t *descr, void *_args)
  73. {
  74. mult_common_codelet(descr, 0, _args);
  75. }
  76. #ifdef USE_CUDA
  77. void mult_cublas_codelet(starpu_data_interface_t *descr, void *_args)
  78. {
  79. mult_common_codelet(descr, 1, _args);
  80. }
  81. #endif
  82. static void add_sub_common_codelet(starpu_data_interface_t *buffers, int s, __attribute__((unused)) void *arg, float alpha)
  83. {
  84. /* C = A op B */
  85. float *C = (float *)buffers[0].blas.ptr;
  86. float *A = (float *)buffers[1].blas.ptr;
  87. float *B = (float *)buffers[2].blas.ptr;
  88. unsigned n = buffers[0].blas.nx;
  89. unsigned ldA = buffers[1].blas.ld;
  90. unsigned ldB = buffers[2].blas.ld;
  91. unsigned ldC = buffers[0].blas.ld;
  92. double flop = 2.0*n*n;
  93. // TODO check dim ...
  94. unsigned line;
  95. switch (s) {
  96. case 0:
  97. cpus_flop += flop;
  98. for (line = 0; line < n; line++)
  99. {
  100. /* copy line A into C */
  101. SAXPY(n, 1.0f, &A[line*ldA], 1, &C[line*ldC], 1);
  102. /* add line B to C = A */
  103. SAXPY(n, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
  104. }
  105. break;
  106. #ifdef USE_CUDA
  107. case 1:
  108. cublas_flop += flop;
  109. for (line = 0; line < n; line++)
  110. {
  111. /* copy line A into C */
  112. cublasSaxpy(n, 1.0f, &A[line*ldA], 1, &C[line*ldC], 1);
  113. /* add line B to C = A */
  114. cublasSaxpy(n, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
  115. }
  116. break;
  117. #endif
  118. default:
  119. STARPU_ASSERT(0);
  120. break;
  121. }
  122. }
  123. void sub_core_codelet(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
  124. {
  125. add_sub_common_codelet(descr, 0, arg, -1.0f);
  126. }
  127. void add_core_codelet(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
  128. {
  129. add_sub_common_codelet(descr, 0, arg, 1.0f);
  130. }
  131. #ifdef USE_CUDA
  132. void sub_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
  133. {
  134. add_sub_common_codelet(descr, 1, arg, -1.0f);
  135. }
  136. void add_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
  137. {
  138. add_sub_common_codelet(descr, 1, arg, 1.0f);
  139. }
  140. #endif
  141. static void self_add_sub_common_codelet(starpu_data_interface_t *buffers, int s, __attribute__((unused)) void *arg, float alpha)
  142. {
  143. /* C +=/-= A */
  144. float *C = (float *)buffers[0].blas.ptr;
  145. float *A = (float *)buffers[1].blas.ptr;
  146. unsigned n = buffers[0].blas.nx;
  147. unsigned ldA = buffers[1].blas.ld;
  148. unsigned ldC = buffers[0].blas.ld;
  149. double flop = 1.0*n*n;
  150. // TODO check dim ...
  151. unsigned line;
  152. switch (s) {
  153. case 0:
  154. cpus_flop += flop;
  155. for (line = 0; line < n; line++)
  156. {
  157. /* add line A to C */
  158. SAXPY(n, alpha, &A[line*ldA], 1, &C[line*ldC], 1);
  159. }
  160. break;
  161. #ifdef USE_CUDA
  162. case 1:
  163. cublas_flop += flop;
  164. for (line = 0; line < n; line++)
  165. {
  166. /* add line A to C */
  167. cublasSaxpy(n, alpha, &A[line*ldA], 1, &C[line*ldC], 1);
  168. }
  169. break;
  170. #endif
  171. default:
  172. STARPU_ASSERT(0);
  173. break;
  174. }
  175. }
  176. void self_add_core_codelet(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
  177. {
  178. self_add_sub_common_codelet(descr, 0, arg, 1.0f);
  179. }
  180. void self_sub_core_codelet(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
  181. {
  182. self_add_sub_common_codelet(descr, 0, arg, -1.0f);
  183. }
  184. #ifdef USE_CUDA
  185. void self_add_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
  186. {
  187. self_add_sub_common_codelet(descr, 1, arg, 1.0f);
  188. }
  189. void self_sub_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
  190. {
  191. self_add_sub_common_codelet(descr, 1, arg, -1.0f);
  192. }
  193. #endif
  194. /* this codelet does nothing */
  195. void null_codelet(__attribute__((unused)) starpu_data_interface_t *descr,
  196. __attribute__((unused)) void *arg)
  197. {
  198. }