dw_cholesky_kernels.c 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. /*
  2. * StarPU
  3. * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include "dw_cholesky.h"
  17. #include "../common/blas.h"
  18. /*
  19. * U22
  20. */
  21. static inline void chol_common_core_codelet_update_u22(starpu_data_interface_t *buffers, int s, __attribute__((unused)) void *_args)
  22. {
  23. //printf("22\n");
  24. float *left = (float *)buffers[0].blas.ptr;
  25. float *right = (float *)buffers[1].blas.ptr;
  26. float *center = (float *)buffers[2].blas.ptr;
  27. unsigned dx = buffers[2].blas.ny;
  28. unsigned dy = buffers[2].blas.nx;
  29. unsigned dz = buffers[0].blas.ny;
  30. unsigned ld21 = buffers[0].blas.ld;
  31. unsigned ld12 = buffers[1].blas.ld;
  32. unsigned ld22 = buffers[2].blas.ld;
  33. switch (s) {
  34. case 0:
  35. SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
  36. right, ld12, 1.0f, center, ld22);
  37. break;
  38. #ifdef USE_CUDA
  39. case 1:
  40. cublasSgemm('n', 't', dy, dx, dz,
  41. -1.0f, left, ld21, right, ld12,
  42. 1.0f, center, ld22);
  43. break;
  44. #endif
  45. default:
  46. STARPU_ASSERT(0);
  47. break;
  48. }
  49. }
  50. void chol_core_codelet_update_u22(starpu_data_interface_t *descr, void *_args)
  51. {
  52. chol_common_core_codelet_update_u22(descr, 0, _args);
  53. }
  54. #ifdef USE_CUDA
  55. void chol_cublas_codelet_update_u22(starpu_data_interface_t *descr, void *_args)
  56. {
  57. chol_common_core_codelet_update_u22(descr, 1, _args);
  58. }
  59. #endif// USE_CUDA
  60. /*
  61. * U21
  62. */
  63. static inline void chol_common_codelet_update_u21(starpu_data_interface_t *buffers, int s, __attribute__((unused)) void *_args)
  64. {
  65. // printf("21\n");
  66. float *sub11;
  67. float *sub21;
  68. sub11 = (float *)buffers[0].blas.ptr;
  69. sub21 = (float *)buffers[1].blas.ptr;
  70. unsigned ld11 = buffers[0].blas.ld;
  71. unsigned ld21 = buffers[1].blas.ld;
  72. unsigned nx21 = buffers[1].blas.ny;
  73. unsigned ny21 = buffers[1].blas.nx;
  74. switch (s) {
  75. case 0:
  76. STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
  77. break;
  78. #ifdef USE_CUDA
  79. case 1:
  80. cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
  81. break;
  82. #endif
  83. default:
  84. STARPU_ASSERT(0);
  85. break;
  86. }
  87. }
  88. void chol_core_codelet_update_u21(starpu_data_interface_t *descr, void *_args)
  89. {
  90. chol_common_codelet_update_u21(descr, 0, _args);
  91. }
  92. #ifdef USE_CUDA
  93. void chol_cublas_codelet_update_u21(starpu_data_interface_t *descr, void *_args)
  94. {
  95. chol_common_codelet_update_u21(descr, 1, _args);
  96. }
  97. #endif
  98. /*
  99. * U11
  100. */
  101. static inline void chol_common_codelet_update_u11(starpu_data_interface_t *descr, int s, __attribute__((unused)) void *_args)
  102. {
  103. // printf("11\n");
  104. float *sub11;
  105. sub11 = (float *)descr[0].blas.ptr;
  106. unsigned nx = descr[0].blas.ny;
  107. unsigned ld = descr[0].blas.ld;
  108. unsigned z;
  109. switch (s) {
  110. case 0:
  111. /*
  112. * - alpha 11 <- lambda 11 = sqrt(alpha11)
  113. * - alpha 21 <- l 21 = alpha 21 / lambda 11
  114. * - A22 <- A22 - l21 trans(l21)
  115. */
  116. for (z = 0; z < nx; z++)
  117. {
  118. float lambda11;
  119. lambda11 = sqrt(sub11[z+z*ld]);
  120. sub11[z+z*ld] = lambda11;
  121. STARPU_ASSERT(lambda11 != 0.0f);
  122. SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
  123. SSYR("L", nx - z - 1, -1.0f,
  124. &sub11[(z+1)+z*ld], 1,
  125. &sub11[(z+1)+(z+1)*ld], ld);
  126. }
  127. break;
  128. #ifdef USE_CUDA
  129. case 1:
  130. for (z = 0; z < nx; z++)
  131. {
  132. float lambda11;
  133. /* ok that's dirty and ridiculous ... */
  134. cublasGetVector(1, sizeof(float), &sub11[z+z*ld], sizeof(float), &lambda11, sizeof(float));
  135. lambda11 = sqrt(lambda11);
  136. cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
  137. STARPU_ASSERT(lambda11 != 0.0f);
  138. cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
  139. cublasSsyr('U', nx - z - 1, -1.0f,
  140. &sub11[(z+1)+z*ld], 1,
  141. &sub11[(z+1)+(z+1)*ld], ld);
  142. }
  143. break;
  144. #endif
  145. default:
  146. STARPU_ASSERT(0);
  147. break;
  148. }
  149. }
  150. void chol_core_codelet_update_u11(starpu_data_interface_t *descr, void *_args)
  151. {
  152. chol_common_codelet_update_u11(descr, 0, _args);
  153. }
  154. #ifdef USE_CUDA
  155. void chol_cublas_codelet_update_u11(starpu_data_interface_t *descr, void *_args)
  156. {
  157. chol_common_codelet_update_u11(descr, 1, _args);
  158. }
  159. #endif// USE_CUDA