14 years ago · af1a414ef3
--- a/.gitignore
+++ b/.gitignore
@@ -183,3 +183,4 @@ starpu.log
 
				 /gcc-plugin/tests/acquire
			
 
				 /gcc-plugin/tests/unregister
			
 
				 /gcc-plugin/tests/lib-user
			
 
				+/gcc-plugin/examples/matrix-mult
			
--- a/configure.ac
+++ b/configure.ac
@@ -1371,6 +1371,7 @@ AC_OUTPUT([
 
				 	gcc-plugin/src/Makefile
			
 
				 	gcc-plugin/tests/Makefile
			
 
				 	gcc-plugin/tests/run-test
			
 
				+	gcc-plugin/examples/Makefile
			
 
				 	socl/Makefile
			
 
				 	socl/src/Makefile
			
 
				 	libstarpu.pc
			
--- a/gcc-plugin/Makefile.am
+++ b/gcc-plugin/Makefile.am
@@ -13,6 +13,6 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				-SUBDIRS = src tests
			
 
				+SUBDIRS = src tests examples
			
 
				 
			
 
				 EXTRA_DIST = COPYING README
			
--- a/gcc-plugin/examples/Makefile.am
+++ b/gcc-plugin/examples/Makefile.am
@@ -0,0 +1,20 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+noinst_PROGRAMS =				\
			
 
				+  matrix-mult
			
 
				+
			
 
				+AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
			
 
				+AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall
			
--- a/gcc-plugin/examples/matrix-mult.c
+++ b/gcc-plugin/examples/matrix-mult.c
@@ -0,0 +1,304 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
			
 
				+   Copyright (C) 2010 Sylvain Gault
			
 
				+
			
 
				+   StarPU is free software; you can redistribute it and/or modify
			
 
				+   it under the terms of the GNU Lesser General Public License as published by
			
 
				+   the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+   your option) any later version.
			
 
				+
			
 
				+   StarPU is distributed in the hope that it will be useful, but
			
 
				+   WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  */
			
 
				+
			
 
				+/* The classical matrix multiplication example, implemented as a StarPU
			
 
				+   task invoked for different slices of the matrices.  Each invocation may
			
 
				+   of course execute in parallel.  */
			
 
				+
			
 
				+#ifndef STARPU_GCC_PLUGIN
			
 
				+# error must be compiled with the StarPU GCC plug-in
			
 
				+#endif
			
 
				+
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <math.h>
			
 
				+#include <time.h>
			
 
				+#include <stdint.h>
			
 
				+#include <sys/time.h>
			
 
				+
			
 
				+
			
 
				+/* Definition of the StarPU task and its CPU implementation.  */
			
 
				+
			
 
				+static void matmul (const float *A, const float *B, float *C,
			
 
				+		    size_t nx, size_t ny, size_t nz)
			
 
				+  __attribute__ ((task));
			
 
				+
			
 
				+static void matmul_cpu (const float *A, const float *B, float *C,
			
 
				+			size_t nx, size_t ny, size_t nz)
			
 
				+  __attribute__ ((task_implementation ("cpu", matmul)));
			
 
				+
			
 
				+
			
 
				+static void
			
 
				+matmul_cpu (const float *A, const float *B, float *C,
			
 
				+	    size_t nx, size_t ny, size_t nz)
			
 
				+{
			
 
				+  size_t i, j, k;
			
 
				+
			
 
				+  for (j = 0; j < ny; j++)
			
 
				+    for (i = 0; i < nx; i++)
			
 
				+      {
			
 
				+	for (k = 0; k < nz; k++)
			
 
				+	  C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
			
 
				+      }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+static void print_matrix (const float *M, size_t nslicesx,
			
 
				+			  size_t nslicesy, size_t bxdim,
			
 
				+			  size_t bydim)
			
 
				+  __attribute__ ((unused));
			
 
				+
			
 
				+static void
			
 
				+print_matrix (const float *M, size_t nslicesx, size_t nslicesy, size_t bxdim,
			
 
				+	      size_t bydim)
			
 
				+{
			
 
				+  size_t a, b, i, j;
			
 
				+
			
 
				+  for (b = 0; b < nslicesy; b++)
			
 
				+    for (j = 0; j < bydim; j++)
			
 
				+      {
			
 
				+	for (a = 0; a < nslicesx; a++)
			
 
				+	  {
			
 
				+	    for (i = 0; i < bxdim; i++)
			
 
				+	      printf ("%f ",
			
 
				+		      M[b * nslicesx * bxdim * bydim + j * bxdim +
			
 
				+			a * bxdim * bydim + i]);
			
 
				+	  }
			
 
				+	printf ("\b\n");
			
 
				+      }
			
 
				+
			
 
				+  printf ("\n");
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static float
			
 
				+my_rand (void)
			
 
				+{
			
 
				+  return (float) rand () / (float) RAND_MAX;
			
 
				+}
			
 
				+
			
 
				+static double
			
 
				+mean (const double *v, size_t size)
			
 
				+{
			
 
				+  double sum = 0;
			
 
				+  size_t i;
			
 
				+
			
 
				+  for (i = 0; i < size; i++)
			
 
				+    sum += v[i];
			
 
				+
			
 
				+  return sum / size;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static double
			
 
				+stddev (const double *v, size_t size)
			
 
				+{
			
 
				+  double m = mean (v, size);
			
 
				+  double sqsum = 0;
			
 
				+  size_t i;
			
 
				+
			
 
				+  for (i = 0; i < size; i++)
			
 
				+    sqsum += (v[i] - m) * (v[i] - m);
			
 
				+
			
 
				+  return sqrt (sqsum / size);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+int
			
 
				+main (int argc, char **argv)
			
 
				+{
			
 
				+  int mloop, nloop = 0;
			
 
				+  size_t i, j, k;
			
 
				+  size_t nslicesx;
			
 
				+  size_t nslicesy;
			
 
				+  size_t nslicesz;
			
 
				+  size_t xdim, ydim, zdim;
			
 
				+  size_t bxdim, bydim, bzdim;
			
 
				+  struct timeval start_all, end_all;
			
 
				+  struct timeval start_register, end_register;
			
 
				+  struct timeval start_tasks, end_tasks;
			
 
				+  struct timeval start_unregister, end_unregister;
			
 
				+  struct timeval start_compute, end_compute;
			
 
				+
			
 
				+
			
 
				+  if (argc < 4)
			
 
				+    {
			
 
				+      fprintf (stderr, "Usage: %s NLOOPS MATRIX-SIZE NSLICES\n", argv[0]);
			
 
				+      return EXIT_FAILURE;
			
 
				+    }
			
 
				+
			
 
				+  mloop = nloop = atoi (argv[1]);
			
 
				+  zdim = ydim = xdim = atoi (argv[2]);
			
 
				+  nslicesz = nslicesy = nslicesx = atoi (argv[3]);
			
 
				+  bxdim = xdim / nslicesx;
			
 
				+  bydim = ydim / nslicesy;
			
 
				+  bzdim = zdim / nslicesz;
			
 
				+
			
 
				+  if (xdim % nslicesx)
			
 
				+    {
			
 
				+      fprintf (stderr, "MATRIX-SIZE must be a multiple of NSLICES\n");
			
 
				+      return EXIT_FAILURE;
			
 
				+    }
			
 
				+
			
 
				+  fprintf (stderr, "running %d loops with %lux%lux%lu matrices and %lux%lux%lu blocks...\n",
			
 
				+	   nloop,
			
 
				+	   xdim, ydim, zdim,
			
 
				+	   bxdim, bydim, bzdim);
			
 
				+
			
 
				+  double computetime[nloop];
			
 
				+  double starttaskstime[nloop];
			
 
				+
			
 
				+#pragma starpu initialize
			
 
				+
			
 
				+  gettimeofday (&start_all, NULL);
			
 
				+
			
 
				+  float *A, *B, *C;
			
 
				+
			
 
				+  A = malloc (zdim * ydim * sizeof *A);
			
 
				+  B = malloc (xdim * zdim * sizeof *B);
			
 
				+  C = malloc (xdim * ydim * sizeof *C);
			
 
				+
			
 
				+  srand (time (NULL));
			
 
				+  for (i = 0; i < zdim * ydim; i++)
			
 
				+    A[i] = my_rand () * 100;
			
 
				+
			
 
				+  for (i = 0; i < xdim * zdim; i++)
			
 
				+    B[i] = my_rand () * 100;
			
 
				+
			
 
				+#if 0
			
 
				+  print_matrix (A, nslicesz, nslicesy, bzdim, bydim);
			
 
				+  print_matrix (B, nslicesx, nslicesz, bxdim, bzdim);
			
 
				+#endif
			
 
				+
			
 
				+  for (i = 0; i < xdim * ydim; i++)
			
 
				+    C[i] = 0;
			
 
				+
			
 
				+  gettimeofday (&start_register, NULL);
			
 
				+  for (i = 0; i < nslicesy; i++)
			
 
				+    for (j = 0; j < nslicesz; j++)
			
 
				+      {
			
 
				+	/* TODO: Get rid of the `ptr' and `size' variables when the pragma
			
 
				+	   parser supports arbitrary C expressions.  */
			
 
				+
			
 
				+	typeof (A) ptr = &A[i*zdim*bydim + j*bzdim*bydim];
			
 
				+	size_t size = (bzdim * bydim);
			
 
				+#pragma starpu register ptr size
			
 
				+      }
			
 
				+
			
 
				+  for (i = 0; i < nslicesz; i++)
			
 
				+    {
			
 
				+      for (j = 0; j < nslicesx; j++)
			
 
				+	{
			
 
				+	  typeof (B) ptr = &B[i*xdim*bzdim + j*bxdim*bzdim];
			
 
				+	  size_t size = (bxdim * bzdim);
			
 
				+#pragma starpu register ptr size
			
 
				+	}
			
 
				+    }
			
 
				+
			
 
				+  for (i = 0; i < nslicesy; i++)
			
 
				+    {
			
 
				+      for (j = 0; j < nslicesx; j++)
			
 
				+	{
			
 
				+	  typeof (C) ptr = &C[i*xdim*bydim + j*bxdim*bydim];
			
 
				+	  size_t size = (bxdim * bydim);
			
 
				+#pragma starpu register ptr size
			
 
				+	}
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+  gettimeofday (&end_register, NULL);
			
 
				+
			
 
				+  while (nloop--)
			
 
				+    {
			
 
				+      gettimeofday (&start_tasks, NULL);
			
 
				+      gettimeofday (&start_compute, NULL);
			
 
				+      for (i = 0; i < nslicesy; i++)
			
 
				+	for (j = 0; j < nslicesx; j++)
			
 
				+	  for (k = 0; k < nslicesz; k++)
			
 
				+	    /* Make an asynchronous call to `matmul', leading to the
			
 
				+	       instantiation of a StarPU task executing in parallel.  */
			
 
				+	    matmul (&A[i * zdim * bydim + k * bzdim * bydim],
			
 
				+		    &B[k * xdim * bzdim + j * bxdim * bzdim],
			
 
				+		    &C[i * xdim * bydim + j * bxdim * bydim], bxdim,
			
 
				+		    bydim, bzdim);
			
 
				+
			
 
				+      gettimeofday (&end_tasks, NULL);
			
 
				+      starttaskstime[nloop] =
			
 
				+	(end_tasks.tv_sec - start_tasks.tv_sec) + (end_tasks.tv_usec -
			
 
				+						   start_tasks.tv_usec) /
			
 
				+	1000000.0;
			
 
				+
			
 
				+      /* Wait for the asynchronous calls to complete.  */
			
 
				+#pragma starpu wait
			
 
				+
			
 
				+      gettimeofday (&end_compute, NULL);
			
 
				+      computetime[nloop] =
			
 
				+	(end_compute.tv_sec - start_compute.tv_sec) + (end_compute.tv_usec -
			
 
				+						       start_compute.
			
 
				+						       tv_usec) / 1000000.0;
			
 
				+    }
			
 
				+
			
 
				+#if 0
			
 
				+  print_matrix (C, nslicesx, nslicesy, bxdim, bydim);
			
 
				+#endif
			
 
				+
			
 
				+  gettimeofday (&start_unregister, NULL);
			
 
				+  for (i = 0; i < nslicesy; i++)
			
 
				+    for (j = 0; j < nslicesz; j++)
			
 
				+      {
			
 
				+	typeof (A) ptr =  &A[i*zdim*bydim + j*bzdim*bydim];
			
 
				+#pragma starpu unregister ptr
			
 
				+      }
			
 
				+
			
 
				+
			
 
				+  for (i = 0; i < nslicesz; i++)
			
 
				+    for (j = 0; j < nslicesx; j++)
			
 
				+      {
			
 
				+	typeof (B) ptr = &B[i*xdim*bzdim + j*bxdim*bzdim];
			
 
				+#pragma starpu unregister ptr
			
 
				+      }
			
 
				+
			
 
				+  for (i = 0; i < nslicesy; i++)
			
 
				+    for (j = 0; j < nslicesx; j++)
			
 
				+      {
			
 
				+	typeof (C) ptr = &C[i*xdim*bydim + j*bxdim*bydim];
			
 
				+#pragma starpu unregister ptr
			
 
				+      }
			
 
				+
			
 
				+  gettimeofday (&end_unregister, NULL);
			
 
				+  gettimeofday (&end_all, NULL);
			
 
				+
			
 
				+#pragma starpu shutdown
			
 
				+
			
 
				+  printf ("total: %f\n",
			
 
				+	  (end_all.tv_sec - start_all.tv_sec) + (end_all.tv_usec -
			
 
				+						 start_all.tv_usec) /
			
 
				+	  1000000.0);
			
 
				+  printf ("register: %f\n",
			
 
				+	  (end_register.tv_sec - start_register.tv_sec) +
			
 
				+	  (end_register.tv_usec - start_register.tv_usec) / 1000000.0);
			
 
				+  printf ("unregister: %f\n",
			
 
				+	  (end_unregister.tv_sec - start_unregister.tv_sec) +
			
 
				+	  (end_unregister.tv_usec - start_unregister.tv_usec) / 1000000.0);
			
 
				+
			
 
				+  printf ("mean task launch : %f\n", mean (starttaskstime, mloop));
			
 
				+  printf ("std task launch: %f\n", stddev (starttaskstime, mloop));
			
 
				+
			
 
				+  printf ("mean compute: %f\n", mean (computetime, mloop));
			
 
				+  printf ("std compute: %f\n", stddev (computetime, mloop));
			
 
				+  printf ("Compute performance : %f GFLOPS\n",
			
 
				+	  .002 * xdim * ydim * zdim / (mean (computetime, mloop) * 1000000));
			
 
				+
			
 
				+  return EXIT_SUCCESS;
			
 
				+}