cholesky.h 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2014 Université de Bordeaux 1
  4. * Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #ifndef __DW_CHOLESKY_H__
  18. #define __DW_CHOLESKY_H__
  19. #include <limits.h>
  20. #include <string.h>
  21. #include <math.h>
  22. #include <sys/time.h>
  23. #ifdef STARPU_USE_CUDA
  24. #include <cuda.h>
  25. #include <cuda_runtime.h>
  26. #include <cublas.h>
  27. #endif
  28. #include <common/blas.h>
  29. #include <starpu.h>
  /*
   * fprintf() wrapper: forwards its arguments to fprintf(ofile, fmt, ...)
   * unless the STARPU_SSILENT environment variable is set, in which case
   * nothing is printed. The environment is queried on every expansion.
   * Wrapped in do/while(0) so it behaves as a single statement.
   */
  #define FPRINTF(ofile, fmt, ...) \
      do \
      { \
          if (getenv("STARPU_SSILENT") == NULL) \
          { \
              fprintf(ofile, fmt, ## __VA_ARGS__); \
          } \
      } while (0)

  /* Constant 32; not referenced anywhere else in this header. */
  #define NMAXBLOCKS 32
  /*
   * 64-bit task tag builders. Each macro ORs together a kind marker and
   * the tile indices, then casts the result to starpu_tag_t:
   *
   *   TAG11(k)      kind 1 at bit 60, k in the low bits
   *   TAG21(k,j)    kind 3 at bit 60, k shifted by 32, j in the low bits
   *   TAG22(k,i,j)  kind 4 at bit 60, k shifted by 32, i shifted by 16, j low
   *
   * The fields are OR-ed without masking, so an index wider than its slot
   * spills into the neighbouring field.
   */
  #define TAG11(k) \
      ((starpu_tag_t)((1ULL << 60) | (unsigned long long)(k)))

  #define TAG21(k,j) \
      ((starpu_tag_t)((3ULL << 60) \
                      | ((unsigned long long)(k) << 32) \
                      | (unsigned long long)(j)))

  #define TAG22(k,i,j) \
      ((starpu_tag_t)((4ULL << 60) \
                      | ((unsigned long long)(k) << 32) \
                      | ((unsigned long long)(i) << 16) \
                      | (unsigned long long)(j)))

  /*
   * Same layout as above, except that the kind marker moves down to bit 56
   * and a caller-supplied prefix occupies the bits from 60 upwards.
   */
  #define TAG11_AUX(k, prefix) \
      ((starpu_tag_t)(((unsigned long long)(prefix) << 60) \
                      | (1ULL << 56) \
                      | (unsigned long long)(k)))

  #define TAG21_AUX(k,j, prefix) \
      ((starpu_tag_t)(((unsigned long long)(prefix) << 60) \
                      | (3ULL << 56) \
                      | ((unsigned long long)(k) << 32) \
                      | (unsigned long long)(j)))

  #define TAG22_AUX(k,i,j, prefix) \
      ((starpu_tag_t)(((unsigned long long)(prefix) << 60) \
                      | (4ULL << 56) \
                      | ((unsigned long long)(k) << 32) \
                      | ((unsigned long long)(i) << 16) \
                      | (unsigned long long)(j)))
  /*
   * Edge length of one block: the matrix size divided by the number of
   * blocks per dimension. Expands to the file-scope variables `size` and
   * `nblocks`, so it is only meaningful where those are visible and
   * nblocks is non-zero.
   */
  #define BLOCKSIZE (size/nblocks)

  /*
   * 2 * n1 * n2 * n3, computed in uint64_t.
   * Each argument is parenthesised before the cast so that an expression
   * argument (e.g. `a - b` or `a / b`) is evaluated as a whole and then
   * widened, instead of having the cast bind to its first operand only.
   */
  #define BLAS3_FLOP(n1,n2,n3) \
      (2*((uint64_t)(n1))*((uint64_t)(n2))*((uint64_t)(n3)))
  49. /* This is from magma
  50. -- Innovative Computing Laboratory
  51. -- Electrical Engineering and Computer Science Department
  52. -- University of Tennessee
  53. -- (C) Copyright 2009
  54. Redistribution and use in source and binary forms, with or without
  55. modification, are permitted provided that the following conditions
  56. are met:
  57. * Redistributions of source code must retain the above copyright
  58. notice, this list of conditions and the following disclaimer.
  59. * Redistributions in binary form must reproduce the above copyright
  60. notice, this list of conditions and the following disclaimer in the
  61. documentation and/or other materials provided with the distribution.
  62. * Neither the name of the University of Tennessee, Knoxville nor the
  63. names of its contributors may be used to endorse or promote products
  64. derived from this software without specific prior written permission.
  65. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  66. ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  67. LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  68. A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  69. HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  70. SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  71. LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  72. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  73. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  74. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  75. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  76. */
  /*
   * Floating-point operation counts. FMULS_x counts multiplications,
   * FADDS_x counts additions, FLOPS_x is their sum. Every macro evaluates
   * to a double and may evaluate its arguments more than once.
   */

  /* POTRF with problem size __n. */
  #define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
  #define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) ) * (double)(__n) - (1. / 6.)))
  #define FLOPS_SPOTRF(__n) ( FMULS_POTRF((__n)) + FADDS_POTRF((__n)) )

  /* TRMM building blocks: 0.5 * n * m * (m +/- 1). */
  #define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
  #define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))

  /*
   * The imported code selected between (__m, __n) and (__n, __m) according
   * to a `side` argument; that selection is disabled here and the swapped
   * (__n, __m) form is always used.
   */
  #define FMULS_TRMM(__m, __n) ( FMULS_TRMM_2((__n), (__m)) )
  #define FADDS_TRMM(__m, __n) ( FADDS_TRMM_2((__n), (__m)) )

  /*
   * TRSM uses the TRMM counts. The additions must alias FADDS_TRMM:
   * aliasing FMULS_TRMM would make FLOPS_STRSM count the multiplications
   * twice (m*n*(n+1) instead of m*n*n).
   */
  #define FMULS_TRSM FMULS_TRMM
  #define FADDS_TRSM FADDS_TRMM
  #define FLOPS_STRSM(__m, __n) ( FMULS_TRSM((__m), (__n)) + FADDS_TRSM((__m), (__n)) )

  /* GEMM: m*n*k multiplications and as many additions. */
  #define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
  #define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
  #define FLOPS_SGEMM(__m, __n, __k) ( FMULS_GEMM((__m), (__n), (__k)) + FADDS_GEMM((__m), (__n), (__k)) )
  90. /* End of magma code */
  /*
   * Run-time options. Each translation unit including this header gets its
   * own private copy; parse_args() in this header overwrites them from the
   * command line.
   */

  /* Problem geometry (-size, -nblocks, -nbigblocks). */
  static unsigned int size = 4 * 1024;
  static unsigned int nblocks = 16;
  static unsigned int nbigblocks = 8;

  /* Cleared by -no-pin. */
  static unsigned int pinned = 1;
  /* Set by -no-prio. */
  static unsigned int noprio = 0;
  /* Set by -check. */
  static unsigned int check = 0;

  /* Set by -bound, -bound-deps, -bound-lp and -bound-mps respectively. */
  static unsigned int bound = 0;
  static unsigned int bound_deps = 0;
  static unsigned int bound_lp = 0;
  static unsigned int bound_mps = 0;

  /* Set by -with_ctxs and -with_noctxs respectively. */
  static unsigned int with_ctxs = 0;
  static unsigned int with_noctxs = 0;

  /* Set by -chole1 and -chole2 respectively. */
  static unsigned int chole1 = 0;
  static unsigned int chole2 = 0;
  105. struct starpu_perfmodel chol_model_11;
  106. struct starpu_perfmodel chol_model_21;
  107. struct starpu_perfmodel chol_model_22;
  108. struct starpu_codelet cl11;
  109. struct starpu_codelet cl21;
  110. struct starpu_codelet cl22;
  /*
   * Kernel entry points and cost functions; only declared here, the
   * definitions are not part of this header.
   */

  /* CPU kernels, one per suffix (u11, u21, u22). */
  void chol_cpu_codelet_update_u11(void *descr[], void *_args);
  void chol_cpu_codelet_update_u21(void *descr[], void *_args);
  void chol_cpu_codelet_update_u22(void *descr[], void *_args);

  /* CPU cost functions matching the signature expected by initialize_chol_model(). */
  double cpu_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
  double cpu_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
  double cpu_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);

  #ifdef STARPU_USE_CUDA
  /* CUBLAS kernels and CUDA cost functions; only declared in CUDA-enabled builds. */
  void chol_cublas_codelet_update_u11(void *descr[], void *_args);
  void chol_cublas_codelet_update_u21(void *descr[], void *_args);
  void chol_cublas_codelet_update_u22(void *descr[], void *_args);

  double cuda_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
  double cuda_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
  double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
  #endif

  /*
   * Takes a perfmodel, a symbol string and one cost callback each for CPU
   * and CUDA. What it does with them is defined outside this header.
   */
  void initialize_chol_model(struct starpu_perfmodel *model, char *symbol,
                             double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch *, unsigned),
                             double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch *, unsigned));
  128. static void STARPU_ATTRIBUTE_UNUSED parse_args(int argc, char **argv)
  129. {
  130. int i;
  131. for (i = 1; i < argc; i++)
  132. {
  133. if (strcmp(argv[i], "-with_ctxs") == 0)
  134. {
  135. with_ctxs = 1;
  136. break;
  137. } else
  138. if (strcmp(argv[i], "-with_noctxs") == 0)
  139. {
  140. with_noctxs = 1;
  141. break;
  142. } else
  143. if (strcmp(argv[i], "-chole1") == 0)
  144. {
  145. chole1 = 1;
  146. break;
  147. } else
  148. if (strcmp(argv[i], "-chole2") == 0)
  149. {
  150. chole2 = 1;
  151. break;
  152. } else
  153. if (strcmp(argv[i], "-size") == 0)
  154. {
  155. char *argptr;
  156. size = strtol(argv[++i], &argptr, 10);
  157. } else
  158. if (strcmp(argv[i], "-nblocks") == 0)
  159. {
  160. char *argptr;
  161. nblocks = strtol(argv[++i], &argptr, 10);
  162. } else
  163. if (strcmp(argv[i], "-nbigblocks") == 0)
  164. {
  165. char *argptr;
  166. nbigblocks = strtol(argv[++i], &argptr, 10);
  167. } else
  168. if (strcmp(argv[i], "-no-pin") == 0)
  169. {
  170. pinned = 0;
  171. } else
  172. if (strcmp(argv[i], "-no-prio") == 0)
  173. {
  174. noprio = 1;
  175. } else
  176. if (strcmp(argv[i], "-bound") == 0)
  177. {
  178. bound = 1;
  179. } else
  180. if (strcmp(argv[i], "-bound-lp") == 0)
  181. {
  182. bound_lp = 1;
  183. } else
  184. if (strcmp(argv[i], "-bound-mps") == 0)
  185. {
  186. bound_mps = 1;
  187. } else
  188. if (strcmp(argv[i], "-bound-deps") == 0)
  189. {
  190. bound_deps = 1;
  191. } else
  192. if (strcmp(argv[i], "-check") == 0)
  193. {
  194. check = 1;
  195. } else
  196. /* if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i],"--help") == 0) */
  197. {
  198. fprintf(stderr,"usage : %s [-size size] [-nblocks nblocks] [-no-pin] [-no-prio] [-bound] [-bound-deps] [-bound-lp] [-check]\n", argv[0]);
  199. fprintf(stderr,"Currently selected: %ux%u and %ux%u blocks\n", size, size, nblocks, nblocks);
  200. exit(0);
  201. }
  202. }
  203. }
  204. #endif /* __DW_CHOLESKY_H__ */