dw_mult.h 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009, 2010 Université de Bordeaux 1
  4. * Copyright (C) 2010 Centre National de la Recherche Scientifique
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #ifndef __MULT_H__
  18. #define __MULT_H__
  19. #include <string.h>
  20. #include <math.h>
  21. #include <sys/types.h>
  22. #include <sys/time.h>
  23. #include <pthread.h>
  24. #include <signal.h>
  25. #include <common/blas.h>
  26. #include <common/blas_model.h>
  27. #include <starpu.h>
  28. #ifdef STARPU_USE_CUDA
  29. #include <cuda.h>
  30. #include <cublas.h>
  31. #endif
  /*
   * Upper bounds on the number of slices along each dimension.
   * parse_args() asserts that the requested slice counts do not exceed them.
   */
  #define MAXSLICESX 64
  #define MAXSLICESY 64
  #define MAXSLICESZ 64

  /*
   * 2*n1*n2*n3, evaluated in 64-bit unsigned arithmetic (presumably the
   * floating-point operation count of an n1 x n2 x n3 matrix product).
   *
   * Every argument is parenthesized before the cast so that an expression
   * argument is converted as a whole, not just its first operand.
   */
  #define BLAS3_FLOP(n1,n2,n3) \
      (2*((uint64_t)(n1))*((uint64_t)(n2))*((uint64_t)(n3)))

  /*
   * (2*n1*n3 + n1*n2 + n2*n3) * sizeof(float), i.e. a size in bytes
   * (presumably the load/store volume matching BLAS3_FLOP).
   *
   * The operands are widened to 64 bits before multiplying so that large
   * 32-bit dimensions cannot wrap around before the sizeof scaling.
   */
  #define BLAS3_LS(n1,n2,n3) \
      ((2*((uint64_t)(n1))*((uint64_t)(n3)) \
        + ((uint64_t)(n1))*((uint64_t)(n2)) \
        + ((uint64_t)(n2))*((uint64_t)(n3)))*sizeof(float))
  /*
   * Block configuration record: four consecutive 32-bit unsigned fields.
   * NOTE(review): m, n and k look like the dimensions of one block and pad
   * like explicit padding; nothing in this header confirms that, verify
   * against the users of the file-scope `conf` object.
   */
  struct block_conf {
      uint32_t m, n, k;
      uint32_t pad;
  };
  /* Initial value of niter. */
  #define NITER 100

  /*
   * Run-time parameters. The values below are the defaults; parse_args()
   * overwrites them from the command line.
   */

  /* Set by -iter. */
  unsigned niter = NITER;

  /* Slice counts: -nblocks sets all three, -nblocksx/-nblocksy/-nblocksz one. */
  unsigned nslicesx = 4, nslicesy = 4, nslicesz = 4;

  /* Dimensions, set by -x, -y and -z. */
  unsigned xdim = 256, ydim = 256, zdim = 64;

  /* Flags, raised to 1 by -no-random, -pin, -common-model and -check. */
  unsigned norandom = 0, pin = 0, use_common_model = 0, check = 0;
  57. /* to compute MFlop/s */
  58. uint64_t flop_cublas = 0;
  59. uint64_t flop_atlas = 0;
  60. uint64_t flop_per_worker[STARPU_NMAXWORKERS] = {0};
  61. /* to compute MB/s (load/store) */
  62. uint64_t ls_cublas = 0;
  63. uint64_t ls_atlas = 0;
  64. uint64_t ls_per_worker[STARPU_NMAXWORKERS] = {0};
  65. struct timeval start;
  66. struct timeval end;
  67. static int taskcounter __attribute__ ((unused));
  68. static struct block_conf conf __attribute__ ((aligned (128)));
  /*
   * Size of one block along each dimension: the dimension divided by the
   * slice count for that dimension. These expand to unsigned integer
   * divisions of the globals, so any remainder is discarded and the
   * slice counts must be non-zero.
   */
  #define BLOCKSIZEX ((xdim) / (nslicesx))
  #define BLOCKSIZEY ((ydim) / (nslicesy))
  #define BLOCKSIZEZ ((zdim) / (nslicesz))
  72. static void display_stats(double timing)
  73. {
  74. unsigned worker;
  75. unsigned nworkers = starpu_worker_get_count();
  76. fprintf(stderr, "Computation took (ms):\n");
  77. printf("%2.2f\n", timing/1000);
  78. uint64_t flop_total = 0, ls_total = 0;
  79. for (worker = 0; worker < nworkers; worker++)
  80. {
  81. flop_total += flop_per_worker[worker];
  82. ls_total += ls_per_worker[worker];
  83. char name[32];
  84. starpu_worker_get_name(worker, name, 32);
  85. fprintf(stderr, "\t%s -> %2.2f GFlop\t%2.2f GFlop/s\n", name, (double)flop_per_worker[worker]/1000000000.0f, (double)flop_per_worker[worker]/(double)timing/1000);
  86. }
  87. fprintf(stderr, "Total: %2.2f GFlops\t%2.2f GFlop/s\n", (double)flop_total/1000000000.0f, (double)flop_total/(double)timing/1000);
  88. }
  89. static void parse_args(int argc, char **argv)
  90. {
  91. int i;
  92. for (i = 1; i < argc; i++) {
  93. if (strcmp(argv[i], "-nblocks") == 0) {
  94. char *argptr;
  95. nslicesx = strtol(argv[++i], &argptr, 10);
  96. nslicesy = nslicesx;
  97. nslicesz = nslicesx;
  98. }
  99. if (strcmp(argv[i], "-nblocksx") == 0) {
  100. char *argptr;
  101. nslicesx = strtol(argv[++i], &argptr, 10);
  102. }
  103. if (strcmp(argv[i], "-nblocksy") == 0) {
  104. char *argptr;
  105. nslicesy = strtol(argv[++i], &argptr, 10);
  106. }
  107. if (strcmp(argv[i], "-nblocksz") == 0) {
  108. char *argptr;
  109. nslicesz = strtol(argv[++i], &argptr, 10);
  110. }
  111. if (strcmp(argv[i], "-x") == 0) {
  112. char *argptr;
  113. xdim = strtol(argv[++i], &argptr, 10);
  114. }
  115. if (strcmp(argv[i], "-y") == 0) {
  116. char *argptr;
  117. ydim = strtol(argv[++i], &argptr, 10);
  118. }
  119. if (strcmp(argv[i], "-z") == 0) {
  120. char *argptr;
  121. zdim = strtol(argv[++i], &argptr, 10);
  122. }
  123. if (strcmp(argv[i], "-iter") == 0) {
  124. char *argptr;
  125. niter = strtol(argv[++i], &argptr, 10);
  126. }
  127. if (strcmp(argv[i], "-no-random") == 0) {
  128. norandom = 1;
  129. }
  130. if (strcmp(argv[i], "-pin") == 0) {
  131. pin = 1;
  132. }
  133. if (strcmp(argv[i], "-check") == 0) {
  134. check = 1;
  135. }
  136. if (strcmp(argv[i], "-common-model") == 0) {
  137. use_common_model = 1;
  138. }
  139. }
  140. assert(nslicesx <= MAXSLICESX);
  141. assert(nslicesy <= MAXSLICESY);
  142. assert(nslicesz <= MAXSLICESZ);
  143. }
  144. static void display_memory_consumption(void)
  145. {
  146. fprintf(stderr, "Total memory : %ld MB\n",
  147. (MAXSLICESY*MAXSLICESZ*sizeof(TYPE *)
  148. + MAXSLICESZ*MAXSLICESX*sizeof(TYPE *)
  149. + MAXSLICESY*MAXSLICESX*sizeof(TYPE *)
  150. + MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle)
  151. + MAXSLICESZ*MAXSLICESX*sizeof(starpu_data_handle)
  152. + MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle)
  153. + ydim*zdim*sizeof(TYPE)
  154. + zdim*xdim*sizeof(TYPE)
  155. + ydim*xdim*sizeof(TYPE))/(1024*1024) );
  156. }
  /*
   * Multiplication kernels, declared here and defined elsewhere.
   * NOTE(review): descr and arg are not described in this header; they are
   * presumably the StarPU buffer descriptors and codelet argument (confirm
   * against the definitions). arg is marked unused in both prototypes.
   */
  #ifdef STARPU_USE_CUDA
  /* Only declared when STARPU_USE_CUDA is defined. */
  void cublas_mult(void *descr[],
                   __attribute__((unused)) void *arg);
  #endif

  void cpu_mult(void *descr[],
                __attribute__((unused)) void *arg);
  161. #endif // __MULT_H__