| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283 | /* StarPU --- Runtime system for heterogeneous multicore architectures.   Copyright (C) 2011, 2012 INRIA   Copyright (C) 2010 Sylvain Gault   StarPU is free software; you can redistribute it and/or modify   it under the terms of the GNU Lesser General Public License as published by   the Free Software Foundation; either version 2.1 of the License, or (at   your option) any later version.   StarPU is distributed in the hope that it will be useful, but   WITHOUT ANY WARRANTY; without even the implied warranty of   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  *//* The classical matrix multiplication example, implemented as a StarPU   task invoked for different slices of the matrices.  Each invocation may   of course execute in parallel.  */#ifndef STARPU_GCC_PLUGIN# error must be compiled with the StarPU GCC plug-in#endif/* Convenience macro.  */#define __heap __attribute__ ((__heap_allocated__))#include <stdio.h>#include <stdlib.h>#include <math.h>#include <time.h>#include <stdint.h>#include <sys/time.h>/* Definition of the StarPU task and its CPU implementation.  */static void matmul (const float *A, const float *B, float *C,		    size_t nx, size_t ny, size_t nz)  __attribute__ ((task));static void matmul_cpu (const float *A, const float *B, float *C,			size_t nx, size_t ny, size_t nz)  __attribute__ ((task_implementation ("cpu", matmul)));static voidmatmul_cpu (const float *A, const float *B, float *C,	    size_t nx, size_t ny, size_t nz){  size_t i, j, k;  for (j = 0; j < ny; j++)    for (i = 0; i < nx; i++)      {	for (k = 0; k < nz; k++)	  C[j * nx + i] += A[j * nz + k] * B[k * nx + i];      }}static void print_matrix (const float *M, size_t nslicesx,			  size_t nslicesy, size_t bxdim,			  size_t bydim)  __attribute__ ((unused));static voidprint_matrix (const float *M, size_t nslicesx, size_t nslicesy, size_t bxdim,	      size_t bydim){  size_t a, b, i, j;  for (b = 0; b < nslicesy; b++)    for (j = 0; j < bydim; j++)      {	for (a = 0; a < nslicesx; a++)	  {	    for (i = 0; i < bxdim; i++)	      printf ("%f ",		      M[b * nslicesx * bxdim * bydim + j * bxdim +			a * bxdim * bydim + i]);	  }	printf ("\b\n");      }  printf ("\n");}static floatmy_rand (void){  return (float) rand () / (float) RAND_MAX;}static doublemean (const double *v, size_t size){  double sum = 0;  size_t i;  for (i = 0; i < size; i++)    sum += v[i];  return sum / size;}static doublestddev (const double *v, size_t size){  double m = mean (v, size);  double sqsum = 0;  size_t i;  for (i = 0; i < size; i++)    sqsum += (v[i] - m) * (v[i] - m);  return sqrt (sqsum / size);}intmain (int argc, char **argv){  int mloop, nloop = 0;  size_t i, j, k;  size_t nslicesx;  size_t nslicesy;  size_t nslicesz;  size_t xdim, ydim, zdim;  size_t bxdim, bydim, bzdim;  struct timeval start_all, end_all;  struct timeval start_register, end_register;  struct timeval start_tasks, end_tasks;  struct timeval start_unregister, end_unregister;  struct timeval start_compute, end_compute;  if (argc < 4)    {      fprintf (stderr, "Using default values.\nCorrect usage: %s NLOOPS MATRIX-SIZE NSLICES\n", argv[0]);      mloop = nloop = 10;      zdim = ydim = xdim = 16;      nslicesz = nslicesy = nslicesx = 4;    }  else    {      mloop = nloop = atoi (argv[1]);      zdim = ydim = xdim = atoi (argv[2]);      nslicesz = nslicesy = nslicesx = atoi (argv[3]);    }  bxdim = xdim / nslicesx;  bydim = ydim / nslicesy;  bzdim = zdim / nslicesz;  if (xdim % nslicesx)    {      fprintf (stderr, "MATRIX-SIZE must be a multiple of NSLICES\n");      return EXIT_FAILURE;    }  fprintf (stderr, "running %d loops with %lux%lux%lu matrices and %lux%lux%lu blocks...\n",	   nloop,	   xdim, ydim, zdim,	   bxdim, bydim, bzdim);  double computetime[nloop];  double starttaskstime[nloop];#pragma starpu initialize  gettimeofday (&start_all, NULL);  float A[zdim * ydim];  float B[xdim * zdim];  float C[xdim * ydim];  srand (time (NULL));  for (i = 0; i < zdim * ydim; i++)    A[i] = my_rand () * 100;  for (i = 0; i < xdim * zdim; i++)    B[i] = my_rand () * 100;#if 0  print_matrix (A, nslicesz, nslicesy, bzdim, bydim);  print_matrix (B, nslicesx, nslicesz, bxdim, bzdim);#endif  for (i = 0; i < xdim * ydim; i++)    C[i] = 0;  gettimeofday (&start_register, NULL);  for (i = 0; i < nslicesy; i++)    for (j = 0; j < nslicesz; j++)#pragma starpu register &A[i*zdim*bydim + j*bzdim*bydim] (bzdim * bydim)  for (i = 0; i < nslicesz; i++)    for (j = 0; j < nslicesx; j++)#pragma starpu register &B[i*xdim*bzdim + j*bxdim*bzdim] (bxdim * bzdim)  for (i = 0; i < nslicesy; i++)    for (j = 0; j < nslicesx; j++)#pragma starpu register &C[i*xdim*bydim + j*bxdim*bydim] (bxdim * bydim)  gettimeofday (&end_register, NULL);  while (nloop--)    {      gettimeofday (&start_tasks, NULL);      gettimeofday (&start_compute, NULL);      for (i = 0; i < nslicesy; i++)	for (j = 0; j < nslicesx; j++)	  for (k = 0; k < nslicesz; k++)	    /* Make an asynchronous call to `matmul', leading to the	       instantiation of a StarPU task executing in parallel.  */	    matmul (&A[i * zdim * bydim + k * bzdim * bydim],		    &B[k * xdim * bzdim + j * bxdim * bzdim],		    &C[i * xdim * bydim + j * bxdim * bydim], bxdim,		    bydim, bzdim);      gettimeofday (&end_tasks, NULL);      starttaskstime[nloop] =	(end_tasks.tv_sec - start_tasks.tv_sec) + (end_tasks.tv_usec -						   start_tasks.tv_usec) /	1000000.0;      /* Wait for the asynchronous calls to complete.  */#pragma starpu wait      gettimeofday (&end_compute, NULL);      computetime[nloop] =	(end_compute.tv_sec - start_compute.tv_sec) + (end_compute.tv_usec -						       start_compute.						       tv_usec) / 1000000.0;    }#if 0  print_matrix (C, nslicesx, nslicesy, bxdim, bydim);#endif  gettimeofday (&start_unregister, NULL);  for (i = 0; i < nslicesy; i++)    for (j = 0; j < nslicesz; j++)#pragma starpu unregister &A[i*zdim*bydim + j*bzdim*bydim]  for (i = 0; i < nslicesz; i++)    for (j = 0; j < nslicesx; j++)#pragma starpu unregister &B[i*xdim*bzdim + j*bxdim*bzdim]  for (i = 0; i < nslicesy; i++)    for (j = 0; j < nslicesx; j++)#pragma starpu unregister &C[i*xdim*bydim + j*bxdim*bydim]  gettimeofday (&end_unregister, NULL);  gettimeofday (&end_all, NULL);#pragma starpu shutdown  printf ("total: %f\n",	  (end_all.tv_sec - start_all.tv_sec) + (end_all.tv_usec -						 start_all.tv_usec) /	  1000000.0);  printf ("register: %f\n",	  (end_register.tv_sec - start_register.tv_sec) +	  (end_register.tv_usec - start_register.tv_usec) / 1000000.0);  printf ("unregister: %f\n",	  (end_unregister.tv_sec - start_unregister.tv_sec) +	  (end_unregister.tv_usec - start_unregister.tv_usec) / 1000000.0);  printf ("mean task launch : %f\n", mean (starttaskstime, mloop));  printf ("std task launch: %f\n", stddev (starttaskstime, mloop));  printf ("mean compute: %f\n", mean (computetime, mloop));  printf ("std compute: %f\n", stddev (computetime, mloop));  printf ("Compute performance : %f GFLOPS\n",	  .002 * xdim * ydim * zdim / (mean (computetime, mloop) * 1000000));  return EXIT_SUCCESS;}
 |