|
@@ -0,0 +1,304 @@
|
|
|
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
|
+ Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
|
|
|
+ Copyright (C) 2010 Sylvain Gault
|
|
|
+
|
|
|
+ StarPU is free software; you can redistribute it and/or modify
|
|
|
+ it under the terms of the GNU Lesser General Public License as published by
|
|
|
+ the Free Software Foundation; either version 2.1 of the License, or (at
|
|
|
+ your option) any later version.
|
|
|
+
|
|
|
+ StarPU is distributed in the hope that it will be useful, but
|
|
|
+ WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */
|
|
|
+
|
|
|
+/* The classical matrix multiplication example, implemented as a StarPU
|
|
|
+ task invoked for different slices of the matrices. Each invocation may
|
|
|
+ of course execute in parallel. */
|
|
|
+
|
|
|
+#ifndef STARPU_GCC_PLUGIN
|
|
|
+# error must be compiled with the StarPU GCC plug-in
|
|
|
+#endif
|
|
|
+
|
|
|
+#include <stdio.h>
|
|
|
+#include <stdlib.h>
|
|
|
+#include <math.h>
|
|
|
+#include <time.h>
|
|
|
+#include <stdint.h>
|
|
|
+#include <sys/time.h>
|
|
|
+
|
|
|
+
|
|
|
+/* Definition of the StarPU task and its CPU implementation. */
|
|
|
+
|
|
|
+static void matmul (const float *A, const float *B, float *C,
|
|
|
+ size_t nx, size_t ny, size_t nz)
|
|
|
+ __attribute__ ((task));
|
|
|
+
|
|
|
+static void matmul_cpu (const float *A, const float *B, float *C,
|
|
|
+ size_t nx, size_t ny, size_t nz)
|
|
|
+ __attribute__ ((task_implementation ("cpu", matmul)));
|
|
|
+
|
|
|
+
|
|
|
+static void
|
|
|
+matmul_cpu (const float *A, const float *B, float *C,
|
|
|
+ size_t nx, size_t ny, size_t nz)
|
|
|
+{
|
|
|
+ size_t i, j, k;
|
|
|
+
|
|
|
+ for (j = 0; j < ny; j++)
|
|
|
+ for (i = 0; i < nx; i++)
|
|
|
+ {
|
|
|
+ for (k = 0; k < nz; k++)
|
|
|
+ C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+static void print_matrix (const float *M, size_t nslicesx,
|
|
|
+ size_t nslicesy, size_t bxdim,
|
|
|
+ size_t bydim)
|
|
|
+ __attribute__ ((unused));
|
|
|
+
|
|
|
+static void
|
|
|
+print_matrix (const float *M, size_t nslicesx, size_t nslicesy, size_t bxdim,
|
|
|
+ size_t bydim)
|
|
|
+{
|
|
|
+ size_t a, b, i, j;
|
|
|
+
|
|
|
+ for (b = 0; b < nslicesy; b++)
|
|
|
+ for (j = 0; j < bydim; j++)
|
|
|
+ {
|
|
|
+ for (a = 0; a < nslicesx; a++)
|
|
|
+ {
|
|
|
+ for (i = 0; i < bxdim; i++)
|
|
|
+ printf ("%f ",
|
|
|
+ M[b * nslicesx * bxdim * bydim + j * bxdim +
|
|
|
+ a * bxdim * bydim + i]);
|
|
|
+ }
|
|
|
+ printf ("\b\n");
|
|
|
+ }
|
|
|
+
|
|
|
+ printf ("\n");
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+static float
|
|
|
+my_rand (void)
|
|
|
+{
|
|
|
+ return (float) rand () / (float) RAND_MAX;
|
|
|
+}
|
|
|
+
|
|
|
+static double
|
|
|
+mean (const double *v, size_t size)
|
|
|
+{
|
|
|
+ double sum = 0;
|
|
|
+ size_t i;
|
|
|
+
|
|
|
+ for (i = 0; i < size; i++)
|
|
|
+ sum += v[i];
|
|
|
+
|
|
|
+ return sum / size;
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+static double
|
|
|
+stddev (const double *v, size_t size)
|
|
|
+{
|
|
|
+ double m = mean (v, size);
|
|
|
+ double sqsum = 0;
|
|
|
+ size_t i;
|
|
|
+
|
|
|
+ for (i = 0; i < size; i++)
|
|
|
+ sqsum += (v[i] - m) * (v[i] - m);
|
|
|
+
|
|
|
+ return sqrt (sqsum / size);
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+int
|
|
|
+main (int argc, char **argv)
|
|
|
+{
|
|
|
+ int mloop, nloop = 0;
|
|
|
+ size_t i, j, k;
|
|
|
+ size_t nslicesx;
|
|
|
+ size_t nslicesy;
|
|
|
+ size_t nslicesz;
|
|
|
+ size_t xdim, ydim, zdim;
|
|
|
+ size_t bxdim, bydim, bzdim;
|
|
|
+ struct timeval start_all, end_all;
|
|
|
+ struct timeval start_register, end_register;
|
|
|
+ struct timeval start_tasks, end_tasks;
|
|
|
+ struct timeval start_unregister, end_unregister;
|
|
|
+ struct timeval start_compute, end_compute;
|
|
|
+
|
|
|
+
|
|
|
+ if (argc < 4)
|
|
|
+ {
|
|
|
+ fprintf (stderr, "Usage: %s NLOOPS MATRIX-SIZE NSLICES\n", argv[0]);
|
|
|
+ return EXIT_FAILURE;
|
|
|
+ }
|
|
|
+
|
|
|
+ mloop = nloop = atoi (argv[1]);
|
|
|
+ zdim = ydim = xdim = atoi (argv[2]);
|
|
|
+ nslicesz = nslicesy = nslicesx = atoi (argv[3]);
|
|
|
+ bxdim = xdim / nslicesx;
|
|
|
+ bydim = ydim / nslicesy;
|
|
|
+ bzdim = zdim / nslicesz;
|
|
|
+
|
|
|
+ if (xdim % nslicesx)
|
|
|
+ {
|
|
|
+ fprintf (stderr, "MATRIX-SIZE must be a multiple of NSLICES\n");
|
|
|
+ return EXIT_FAILURE;
|
|
|
+ }
|
|
|
+
|
|
|
+ fprintf (stderr, "running %d loops with %lux%lux%lu matrices and %lux%lux%lu blocks...\n",
|
|
|
+ nloop,
|
|
|
+ xdim, ydim, zdim,
|
|
|
+ bxdim, bydim, bzdim);
|
|
|
+
|
|
|
+ double computetime[nloop];
|
|
|
+ double starttaskstime[nloop];
|
|
|
+
|
|
|
+#pragma starpu initialize
|
|
|
+
|
|
|
+ gettimeofday (&start_all, NULL);
|
|
|
+
|
|
|
+ float *A, *B, *C;
|
|
|
+
|
|
|
+ A = malloc (zdim * ydim * sizeof *A);
|
|
|
+ B = malloc (xdim * zdim * sizeof *B);
|
|
|
+ C = malloc (xdim * ydim * sizeof *C);
|
|
|
+
|
|
|
+ srand (time (NULL));
|
|
|
+ for (i = 0; i < zdim * ydim; i++)
|
|
|
+ A[i] = my_rand () * 100;
|
|
|
+
|
|
|
+ for (i = 0; i < xdim * zdim; i++)
|
|
|
+ B[i] = my_rand () * 100;
|
|
|
+
|
|
|
+#if 0
|
|
|
+ print_matrix (A, nslicesz, nslicesy, bzdim, bydim);
|
|
|
+ print_matrix (B, nslicesx, nslicesz, bxdim, bzdim);
|
|
|
+#endif
|
|
|
+
|
|
|
+ for (i = 0; i < xdim * ydim; i++)
|
|
|
+ C[i] = 0;
|
|
|
+
|
|
|
+ gettimeofday (&start_register, NULL);
|
|
|
+ for (i = 0; i < nslicesy; i++)
|
|
|
+ for (j = 0; j < nslicesz; j++)
|
|
|
+ {
|
|
|
+ /* TODO: Get rid of the `ptr' and `size' variables when the pragma
|
|
|
+ parser supports arbitrary C expressions. */
|
|
|
+
|
|
|
+ typeof (A) ptr = &A[i*zdim*bydim + j*bzdim*bydim];
|
|
|
+ size_t size = (bzdim * bydim);
|
|
|
+#pragma starpu register ptr size
|
|
|
+ }
|
|
|
+
|
|
|
+ for (i = 0; i < nslicesz; i++)
|
|
|
+ {
|
|
|
+ for (j = 0; j < nslicesx; j++)
|
|
|
+ {
|
|
|
+ typeof (B) ptr = &B[i*xdim*bzdim + j*bxdim*bzdim];
|
|
|
+ size_t size = (bxdim * bzdim);
|
|
|
+#pragma starpu register ptr size
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ for (i = 0; i < nslicesy; i++)
|
|
|
+ {
|
|
|
+ for (j = 0; j < nslicesx; j++)
|
|
|
+ {
|
|
|
+ typeof (C) ptr = &C[i*xdim*bydim + j*bxdim*bydim];
|
|
|
+ size_t size = (bxdim * bydim);
|
|
|
+#pragma starpu register ptr size
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ gettimeofday (&end_register, NULL);
|
|
|
+
|
|
|
+ while (nloop--)
|
|
|
+ {
|
|
|
+ gettimeofday (&start_tasks, NULL);
|
|
|
+ gettimeofday (&start_compute, NULL);
|
|
|
+ for (i = 0; i < nslicesy; i++)
|
|
|
+ for (j = 0; j < nslicesx; j++)
|
|
|
+ for (k = 0; k < nslicesz; k++)
|
|
|
+ /* Make an asynchronous call to `matmul', leading to the
|
|
|
+ instantiation of a StarPU task executing in parallel. */
|
|
|
+ matmul (&A[i * zdim * bydim + k * bzdim * bydim],
|
|
|
+ &B[k * xdim * bzdim + j * bxdim * bzdim],
|
|
|
+ &C[i * xdim * bydim + j * bxdim * bydim], bxdim,
|
|
|
+ bydim, bzdim);
|
|
|
+
|
|
|
+ gettimeofday (&end_tasks, NULL);
|
|
|
+ starttaskstime[nloop] =
|
|
|
+ (end_tasks.tv_sec - start_tasks.tv_sec) + (end_tasks.tv_usec -
|
|
|
+ start_tasks.tv_usec) /
|
|
|
+ 1000000.0;
|
|
|
+
|
|
|
+ /* Wait for the asynchronous calls to complete. */
|
|
|
+#pragma starpu wait
|
|
|
+
|
|
|
+ gettimeofday (&end_compute, NULL);
|
|
|
+ computetime[nloop] =
|
|
|
+ (end_compute.tv_sec - start_compute.tv_sec) + (end_compute.tv_usec -
|
|
|
+ start_compute.
|
|
|
+ tv_usec) / 1000000.0;
|
|
|
+ }
|
|
|
+
|
|
|
+#if 0
|
|
|
+ print_matrix (C, nslicesx, nslicesy, bxdim, bydim);
|
|
|
+#endif
|
|
|
+
|
|
|
+ gettimeofday (&start_unregister, NULL);
|
|
|
+ for (i = 0; i < nslicesy; i++)
|
|
|
+ for (j = 0; j < nslicesz; j++)
|
|
|
+ {
|
|
|
+ typeof (A) ptr = &A[i*zdim*bydim + j*bzdim*bydim];
|
|
|
+#pragma starpu unregister ptr
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ for (i = 0; i < nslicesz; i++)
|
|
|
+ for (j = 0; j < nslicesx; j++)
|
|
|
+ {
|
|
|
+ typeof (B) ptr = &B[i*xdim*bzdim + j*bxdim*bzdim];
|
|
|
+#pragma starpu unregister ptr
|
|
|
+ }
|
|
|
+
|
|
|
+ for (i = 0; i < nslicesy; i++)
|
|
|
+ for (j = 0; j < nslicesx; j++)
|
|
|
+ {
|
|
|
+ typeof (C) ptr = &C[i*xdim*bydim + j*bxdim*bydim];
|
|
|
+#pragma starpu unregister ptr
|
|
|
+ }
|
|
|
+
|
|
|
+ gettimeofday (&end_unregister, NULL);
|
|
|
+ gettimeofday (&end_all, NULL);
|
|
|
+
|
|
|
+#pragma starpu shutdown
|
|
|
+
|
|
|
+ printf ("total: %f\n",
|
|
|
+ (end_all.tv_sec - start_all.tv_sec) + (end_all.tv_usec -
|
|
|
+ start_all.tv_usec) /
|
|
|
+ 1000000.0);
|
|
|
+ printf ("register: %f\n",
|
|
|
+ (end_register.tv_sec - start_register.tv_sec) +
|
|
|
+ (end_register.tv_usec - start_register.tv_usec) / 1000000.0);
|
|
|
+ printf ("unregister: %f\n",
|
|
|
+ (end_unregister.tv_sec - start_unregister.tv_sec) +
|
|
|
+ (end_unregister.tv_usec - start_unregister.tv_usec) / 1000000.0);
|
|
|
+
|
|
|
+ printf ("mean task launch : %f\n", mean (starttaskstime, mloop));
|
|
|
+ printf ("std task launch: %f\n", stddev (starttaskstime, mloop));
|
|
|
+
|
|
|
+ printf ("mean compute: %f\n", mean (computetime, mloop));
|
|
|
+ printf ("std compute: %f\n", stddev (computetime, mloop));
|
|
|
+ printf ("Compute performance : %f GFLOPS\n",
|
|
|
+ .002 * xdim * ydim * zdim / (mean (computetime, mloop) * 1000000));
|
|
|
+
|
|
|
+ return EXIT_SUCCESS;
|
|
|
+}
|