| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277 | 
							- /* StarPU --- Runtime system for heterogeneous multicore architectures.
 
-    Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
 
-    Copyright (C) 2010 Sylvain Gault
 
-    StarPU is free software; you can redistribute it and/or modify
 
-    it under the terms of the GNU Lesser General Public License as published by
 
-    the Free Software Foundation; either version 2.1 of the License, or (at
 
-    your option) any later version.
 
-    StarPU is distributed in the hope that it will be useful, but
 
-    WITHOUT ANY WARRANTY; without even the implied warranty of
 
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  */
 
- /* The classical matrix multiplication example, implemented as a StarPU
 
-    task invoked for different slices of the matrices.  Each invocation may
 
-    of course execute in parallel.  */
 
- #ifndef STARPU_GCC_PLUGIN
 
- # error must be compiled with the StarPU GCC plug-in
 
- #endif
 
- #include <stdio.h>
 
- #include <stdlib.h>
 
- #include <math.h>
 
- #include <time.h>
 
- #include <stdint.h>
 
- #include <sys/time.h>
 
- /* Definition of the StarPU task and its CPU implementation.  */
 
- static void matmul (const float *A, const float *B, float *C,
 
- 		    size_t nx, size_t ny, size_t nz)
 
-   __attribute__ ((task));
 
- static void matmul_cpu (const float *A, const float *B, float *C,
 
- 			size_t nx, size_t ny, size_t nz)
 
-   __attribute__ ((task_implementation ("cpu", matmul)));
 
- static void
 
- matmul_cpu (const float *A, const float *B, float *C,
 
- 	    size_t nx, size_t ny, size_t nz)
 
- {
 
-   size_t i, j, k;
 
-   for (j = 0; j < ny; j++)
 
-     for (i = 0; i < nx; i++)
 
-       {
 
- 	for (k = 0; k < nz; k++)
 
- 	  C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
 
-       }
 
- }
 
- static void print_matrix (const float *M, size_t nslicesx,
 
- 			  size_t nslicesy, size_t bxdim,
 
- 			  size_t bydim)
 
-   __attribute__ ((unused));
 
- static void
 
- print_matrix (const float *M, size_t nslicesx, size_t nslicesy, size_t bxdim,
 
- 	      size_t bydim)
 
- {
 
-   size_t a, b, i, j;
 
-   for (b = 0; b < nslicesy; b++)
 
-     for (j = 0; j < bydim; j++)
 
-       {
 
- 	for (a = 0; a < nslicesx; a++)
 
- 	  {
 
- 	    for (i = 0; i < bxdim; i++)
 
- 	      printf ("%f ",
 
- 		      M[b * nslicesx * bxdim * bydim + j * bxdim +
 
- 			a * bxdim * bydim + i]);
 
- 	  }
 
- 	printf ("\b\n");
 
-       }
 
-   printf ("\n");
 
- }
 
- static float
 
- my_rand (void)
 
- {
 
-   return (float) rand () / (float) RAND_MAX;
 
- }
 
- static double
 
- mean (const double *v, size_t size)
 
- {
 
-   double sum = 0;
 
-   size_t i;
 
-   for (i = 0; i < size; i++)
 
-     sum += v[i];
 
-   return sum / size;
 
- }
 
- static double
 
- stddev (const double *v, size_t size)
 
- {
 
-   double m = mean (v, size);
 
-   double sqsum = 0;
 
-   size_t i;
 
-   for (i = 0; i < size; i++)
 
-     sqsum += (v[i] - m) * (v[i] - m);
 
-   return sqrt (sqsum / size);
 
- }
 
- int
 
- main (int argc, char **argv)
 
- {
 
-   int mloop, nloop = 0;
 
-   size_t i, j, k;
 
-   size_t nslicesx;
 
-   size_t nslicesy;
 
-   size_t nslicesz;
 
-   size_t xdim, ydim, zdim;
 
-   size_t bxdim, bydim, bzdim;
 
-   struct timeval start_all, end_all;
 
-   struct timeval start_register, end_register;
 
-   struct timeval start_tasks, end_tasks;
 
-   struct timeval start_unregister, end_unregister;
 
-   struct timeval start_compute, end_compute;
 
-   if (argc < 4)
 
-     {
 
-       fprintf (stderr, "Usage: %s NLOOPS MATRIX-SIZE NSLICES\n", argv[0]);
 
-       return EXIT_FAILURE;
 
-     }
 
-   mloop = nloop = atoi (argv[1]);
 
-   zdim = ydim = xdim = atoi (argv[2]);
 
-   nslicesz = nslicesy = nslicesx = atoi (argv[3]);
 
-   bxdim = xdim / nslicesx;
 
-   bydim = ydim / nslicesy;
 
-   bzdim = zdim / nslicesz;
 
-   if (xdim % nslicesx)
 
-     {
 
-       fprintf (stderr, "MATRIX-SIZE must be a multiple of NSLICES\n");
 
-       return EXIT_FAILURE;
 
-     }
 
-   fprintf (stderr, "running %d loops with %lux%lux%lu matrices and %lux%lux%lu blocks...\n",
 
- 	   nloop,
 
- 	   xdim, ydim, zdim,
 
- 	   bxdim, bydim, bzdim);
 
-   double computetime[nloop];
 
-   double starttaskstime[nloop];
 
- #pragma starpu initialize
 
-   gettimeofday (&start_all, NULL);
 
-   float *A, *B, *C;
 
-   A = malloc (zdim * ydim * sizeof *A);
 
-   B = malloc (xdim * zdim * sizeof *B);
 
-   C = malloc (xdim * ydim * sizeof *C);
 
-   srand (time (NULL));
 
-   for (i = 0; i < zdim * ydim; i++)
 
-     A[i] = my_rand () * 100;
 
-   for (i = 0; i < xdim * zdim; i++)
 
-     B[i] = my_rand () * 100;
 
- #if 0
 
-   print_matrix (A, nslicesz, nslicesy, bzdim, bydim);
 
-   print_matrix (B, nslicesx, nslicesz, bxdim, bzdim);
 
- #endif
 
-   for (i = 0; i < xdim * ydim; i++)
 
-     C[i] = 0;
 
-   gettimeofday (&start_register, NULL);
 
-   for (i = 0; i < nslicesy; i++)
 
-     for (j = 0; j < nslicesz; j++)
 
- #pragma starpu register &A[i*zdim*bydim + j*bzdim*bydim] (bzdim * bydim)
 
-   for (i = 0; i < nslicesz; i++)
 
-     for (j = 0; j < nslicesx; j++)
 
- #pragma starpu register &B[i*xdim*bzdim + j*bxdim*bzdim] (bxdim * bzdim)
 
-   for (i = 0; i < nslicesy; i++)
 
-     for (j = 0; j < nslicesx; j++)
 
- #pragma starpu register &C[i*xdim*bydim + j*bxdim*bydim] (bxdim * bydim)
 
-   gettimeofday (&end_register, NULL);
 
-   while (nloop--)
 
-     {
 
-       gettimeofday (&start_tasks, NULL);
 
-       gettimeofday (&start_compute, NULL);
 
-       for (i = 0; i < nslicesy; i++)
 
- 	for (j = 0; j < nslicesx; j++)
 
- 	  for (k = 0; k < nslicesz; k++)
 
- 	    /* Make an asynchronous call to `matmul', leading to the
 
- 	       instantiation of a StarPU task executing in parallel.  */
 
- 	    matmul (&A[i * zdim * bydim + k * bzdim * bydim],
 
- 		    &B[k * xdim * bzdim + j * bxdim * bzdim],
 
- 		    &C[i * xdim * bydim + j * bxdim * bydim], bxdim,
 
- 		    bydim, bzdim);
 
-       gettimeofday (&end_tasks, NULL);
 
-       starttaskstime[nloop] =
 
- 	(end_tasks.tv_sec - start_tasks.tv_sec) + (end_tasks.tv_usec -
 
- 						   start_tasks.tv_usec) /
 
- 	1000000.0;
 
-       /* Wait for the asynchronous calls to complete.  */
 
- #pragma starpu wait
 
-       gettimeofday (&end_compute, NULL);
 
-       computetime[nloop] =
 
- 	(end_compute.tv_sec - start_compute.tv_sec) + (end_compute.tv_usec -
 
- 						       start_compute.
 
- 						       tv_usec) / 1000000.0;
 
-     }
 
- #if 0
 
-   print_matrix (C, nslicesx, nslicesy, bxdim, bydim);
 
- #endif
 
-   gettimeofday (&start_unregister, NULL);
 
-   for (i = 0; i < nslicesy; i++)
 
-     for (j = 0; j < nslicesz; j++)
 
- #pragma starpu unregister &A[i*zdim*bydim + j*bzdim*bydim]
 
-   for (i = 0; i < nslicesz; i++)
 
-     for (j = 0; j < nslicesx; j++)
 
- #pragma starpu unregister &B[i*xdim*bzdim + j*bxdim*bzdim]
 
-   for (i = 0; i < nslicesy; i++)
 
-     for (j = 0; j < nslicesx; j++)
 
- #pragma starpu unregister &C[i*xdim*bydim + j*bxdim*bydim]
 
-   gettimeofday (&end_unregister, NULL);
 
-   gettimeofday (&end_all, NULL);
 
- #pragma starpu shutdown
 
-   printf ("total: %f\n",
 
- 	  (end_all.tv_sec - start_all.tv_sec) + (end_all.tv_usec -
 
- 						 start_all.tv_usec) /
 
- 	  1000000.0);
 
-   printf ("register: %f\n",
 
- 	  (end_register.tv_sec - start_register.tv_sec) +
 
- 	  (end_register.tv_usec - start_register.tv_usec) / 1000000.0);
 
-   printf ("unregister: %f\n",
 
- 	  (end_unregister.tv_sec - start_unregister.tv_sec) +
 
- 	  (end_unregister.tv_usec - start_unregister.tv_usec) / 1000000.0);
 
-   printf ("mean task launch : %f\n", mean (starttaskstime, mloop));
 
-   printf ("std task launch: %f\n", stddev (starttaskstime, mloop));
 
-   printf ("mean compute: %f\n", mean (computetime, mloop));
 
-   printf ("std compute: %f\n", stddev (computetime, mloop));
 
-   printf ("Compute performance : %f GFLOPS\n",
 
- 	  .002 * xdim * ydim * zdim / (mean (computetime, mloop) * 1000000));
 
-   return EXIT_SUCCESS;
 
- }
 
 
  |