matrix-mult.c 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. Copyright (C) 2011, 2012 Inria
  3. Copyright (C) 2010 Sylvain Gault
  4. StarPU is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU Lesser General Public License as published by
  6. the Free Software Foundation; either version 2.1 of the License, or (at
  7. your option) any later version.
  8. StarPU is distributed in the hope that it will be useful, but
  9. WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */
  11. /* The classical matrix multiplication example, implemented as a StarPU
  12. task invoked for different slices of the matrices. Each invocation may
  13. of course execute in parallel. */
  14. #ifndef STARPU_GCC_PLUGIN
  15. # error must be compiled with the StarPU GCC plug-in
  16. #endif
  17. /* Convenience macro. */
  18. #define __heap __attribute__ ((__heap_allocated__))
  19. #include <stdio.h>
  20. #include <stdlib.h>
  21. #include <math.h>
  22. #include <time.h>
  23. #include <stdint.h>
  24. #include <sys/time.h>
  25. /* Definition of the StarPU task and its CPU implementation. */
  26. static void matmul (const float *A, const float *B, float *C,
  27. size_t nx, size_t ny, size_t nz)
  28. __attribute__ ((task));
  29. static void matmul_cpu (const float *A, const float *B, float *C,
  30. size_t nx, size_t ny, size_t nz)
  31. __attribute__ ((task_implementation ("cpu", matmul)));
  32. static void
  33. matmul_cpu (const float *A, const float *B, float *C,
  34. size_t nx, size_t ny, size_t nz)
  35. {
  36. size_t i, j, k;
  37. for (j = 0; j < ny; j++)
  38. for (i = 0; i < nx; i++)
  39. {
  40. for (k = 0; k < nz; k++)
  41. C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
  42. }
  43. }
  44. static void print_matrix (const float *M, size_t nslicesx,
  45. size_t nslicesy, size_t bxdim,
  46. size_t bydim)
  47. __attribute__ ((unused));
  48. static void
  49. print_matrix (const float *M, size_t nslicesx, size_t nslicesy, size_t bxdim,
  50. size_t bydim)
  51. {
  52. size_t a, b, i, j;
  53. for (b = 0; b < nslicesy; b++)
  54. for (j = 0; j < bydim; j++)
  55. {
  56. for (a = 0; a < nslicesx; a++)
  57. {
  58. for (i = 0; i < bxdim; i++)
  59. printf ("%f ",
  60. M[b * nslicesx * bxdim * bydim + j * bxdim +
  61. a * bxdim * bydim + i]);
  62. }
  63. printf ("\b\n");
  64. }
  65. printf ("\n");
  66. }
  67. static float
  68. my_rand (void)
  69. {
  70. return (float) rand () / (float) RAND_MAX;
  71. }
  72. static double
  73. mean (const double *v, size_t size)
  74. {
  75. double sum = 0;
  76. size_t i;
  77. for (i = 0; i < size; i++)
  78. sum += v[i];
  79. return sum / size;
  80. }
  81. static double
  82. stddev (const double *v, size_t size)
  83. {
  84. double m = mean (v, size);
  85. double sqsum = 0;
  86. size_t i;
  87. for (i = 0; i < size; i++)
  88. sqsum += (v[i] - m) * (v[i] - m);
  89. return sqrt (sqsum / size);
  90. }
  91. int
  92. main (int argc, char **argv)
  93. {
  94. int mloop, nloop = 0;
  95. size_t i, j, k;
  96. size_t nslicesx;
  97. size_t nslicesy;
  98. size_t nslicesz;
  99. size_t xdim, ydim, zdim;
  100. size_t bxdim, bydim, bzdim;
  101. struct timeval start_all, end_all;
  102. struct timeval start_register, end_register;
  103. struct timeval start_tasks, end_tasks;
  104. struct timeval start_unregister, end_unregister;
  105. struct timeval start_compute, end_compute;
  106. if (argc < 4)
  107. {
  108. fprintf (stderr, "Using default values.\nCorrect usage: %s NLOOPS MATRIX-SIZE NSLICES\n", argv[0]);
  109. mloop = nloop = 10;
  110. zdim = ydim = xdim = 16;
  111. nslicesz = nslicesy = nslicesx = 4;
  112. }
  113. else
  114. {
  115. mloop = nloop = atoi (argv[1]);
  116. zdim = ydim = xdim = atoi (argv[2]);
  117. nslicesz = nslicesy = nslicesx = atoi (argv[3]);
  118. }
  119. bxdim = xdim / nslicesx;
  120. bydim = ydim / nslicesy;
  121. bzdim = zdim / nslicesz;
  122. if (xdim % nslicesx)
  123. {
  124. fprintf (stderr, "MATRIX-SIZE must be a multiple of NSLICES\n");
  125. return EXIT_FAILURE;
  126. }
  127. fprintf (stderr, "running %d loops with %ldx%ldx%ld matrices and %ldx%ldx%ld blocks...\n",
  128. nloop,
  129. (long)xdim, (long)ydim, (long)zdim,
  130. (long)bxdim, (long)bydim, (long)bzdim);
  131. double computetime[nloop];
  132. double starttaskstime[nloop];
  133. #pragma starpu initialize
  134. gettimeofday (&start_all, NULL);
  135. float A[zdim * ydim];
  136. float B[xdim * zdim];
  137. float C[xdim * ydim];
  138. srand (time (NULL));
  139. for (i = 0; i < zdim * ydim; i++)
  140. A[i] = my_rand () * 100;
  141. for (i = 0; i < xdim * zdim; i++)
  142. B[i] = my_rand () * 100;
  143. #if 0
  144. print_matrix (A, nslicesz, nslicesy, bzdim, bydim);
  145. print_matrix (B, nslicesx, nslicesz, bxdim, bzdim);
  146. #endif
  147. for (i = 0; i < xdim * ydim; i++)
  148. C[i] = 0;
  149. gettimeofday (&start_register, NULL);
  150. for (i = 0; i < nslicesy; i++)
  151. for (j = 0; j < nslicesz; j++)
  152. #pragma starpu register &A[i*zdim*bydim + j*bzdim*bydim] (bzdim * bydim)
  153. for (i = 0; i < nslicesz; i++)
  154. for (j = 0; j < nslicesx; j++)
  155. #pragma starpu register &B[i*xdim*bzdim + j*bxdim*bzdim] (bxdim * bzdim)
  156. for (i = 0; i < nslicesy; i++)
  157. for (j = 0; j < nslicesx; j++)
  158. #pragma starpu register &C[i*xdim*bydim + j*bxdim*bydim] (bxdim * bydim)
  159. gettimeofday (&end_register, NULL);
  160. while (nloop--)
  161. {
  162. gettimeofday (&start_tasks, NULL);
  163. gettimeofday (&start_compute, NULL);
  164. for (i = 0; i < nslicesy; i++)
  165. for (j = 0; j < nslicesx; j++)
  166. for (k = 0; k < nslicesz; k++)
  167. /* Make an asynchronous call to `matmul', leading to the
  168. instantiation of a StarPU task executing in parallel. */
  169. matmul (&A[i * zdim * bydim + k * bzdim * bydim],
  170. &B[k * xdim * bzdim + j * bxdim * bzdim],
  171. &C[i * xdim * bydim + j * bxdim * bydim], bxdim,
  172. bydim, bzdim);
  173. gettimeofday (&end_tasks, NULL);
  174. starttaskstime[nloop] =
  175. (end_tasks.tv_sec - start_tasks.tv_sec) + (end_tasks.tv_usec -
  176. start_tasks.tv_usec) /
  177. 1000000.0;
  178. /* Wait for the asynchronous calls to complete. */
  179. #pragma starpu wait
  180. gettimeofday (&end_compute, NULL);
  181. computetime[nloop] =
  182. (end_compute.tv_sec - start_compute.tv_sec) + (end_compute.tv_usec -
  183. start_compute.
  184. tv_usec) / 1000000.0;
  185. }
  186. #if 0
  187. print_matrix (C, nslicesx, nslicesy, bxdim, bydim);
  188. #endif
  189. gettimeofday (&start_unregister, NULL);
  190. for (i = 0; i < nslicesy; i++)
  191. for (j = 0; j < nslicesz; j++)
  192. #pragma starpu unregister &A[i*zdim*bydim + j*bzdim*bydim]
  193. for (i = 0; i < nslicesz; i++)
  194. for (j = 0; j < nslicesx; j++)
  195. #pragma starpu unregister &B[i*xdim*bzdim + j*bxdim*bzdim]
  196. for (i = 0; i < nslicesy; i++)
  197. for (j = 0; j < nslicesx; j++)
  198. #pragma starpu unregister &C[i*xdim*bydim + j*bxdim*bydim]
  199. gettimeofday (&end_unregister, NULL);
  200. gettimeofday (&end_all, NULL);
  201. #pragma starpu shutdown
  202. printf ("total: %f\n",
  203. (end_all.tv_sec - start_all.tv_sec) + (end_all.tv_usec -
  204. start_all.tv_usec) /
  205. 1000000.0);
  206. printf ("register: %f\n",
  207. (end_register.tv_sec - start_register.tv_sec) +
  208. (end_register.tv_usec - start_register.tv_usec) / 1000000.0);
  209. printf ("unregister: %f\n",
  210. (end_unregister.tv_sec - start_unregister.tv_sec) +
  211. (end_unregister.tv_usec - start_unregister.tv_usec) / 1000000.0);
  212. printf ("mean task launch : %f\n", mean (starttaskstime, mloop));
  213. printf ("std task launch: %f\n", stddev (starttaskstime, mloop));
  214. printf ("mean compute: %f\n", mean (computetime, mloop));
  215. printf ("std compute: %f\n", stddev (computetime, mloop));
  216. printf ("Compute performance : %f GFLOPS\n",
  217. .002 * xdim * ydim * zdim / (mean (computetime, mloop) * 1000000));
  218. return EXIT_SUCCESS;
  219. }