| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195 | /* StarPU --- Runtime system for heterogeneous multicore architectures. * * Copyright (C) 2012 INRIA * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * StarPU is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. *//* This example showcases features of the StarPU GCC plug-in.  It defines a   "vector scaling" task with multiple CPU implementations, an OpenCL   implementation, and a CUDA implementation.   Compiling it without `-fplugin=starpu.so' yields valid sequential code.  */#include <math.h>#include <stdbool.h>#include <stdio.h>#include <stdlib.h>/* Declare and define the standard CPU implementation.  */static void vector_scal (unsigned int size, float vector[size], float factor)  __attribute__ ((task));/* The CPU implementation.  */static voidvector_scal (unsigned int size, float vector[size], float factor){  unsigned int i;  for (i = 0; i < size; i++)    vector[i] *= factor;}#if defined STARPU_GCC_PLUGIN && defined __SSE__/* The SSE-capable CPU implementation.  */#include <xmmintrin.h>static void vector_scal_sse (unsigned int size, float vector[size], float factor)  __attribute__ ((task_implementation ("cpu", vector_scal)));static voidvector_scal_sse (unsigned int size, float vector[size], float factor){  unsigned int n_iterations = size / 4;  __m128 *VECTOR = (__m128 *) vector;  __m128 _FACTOR STARPU_ATTRIBUTE_ALIGNED(16);  _FACTOR = _mm_set1_ps (factor);  unsigned int i;  for (i = 0; i < n_iterations; i++)    VECTOR[i] = _mm_mul_ps (_FACTOR, VECTOR[i]);  unsigned int remainder = size % 4;  if (remainder != 0)    {      unsigned int start = 4 * n_iterations;      for (i = start; i < start + remainder; ++i)	vector[i] = factor * vector[i];    }}#endif /* __SSE__ *//* Declaration and definition of the OpenCL implementation.  */#if defined STARPU_GCC_PLUGIN && defined STARPU_USE_OPENCL#include <starpu_opencl.h>/* The OpenCL programs, loaded from `main'.  */static struct starpu_opencl_program cl_programs;static void vector_scal_opencl (unsigned int size, float vector[size], float factor)  __attribute__ ((task_implementation ("opencl", vector_scal)));static voidvector_scal_opencl (unsigned int size, float vector[size], float factor){  int id, devid, err;  cl_kernel kernel;  cl_command_queue queue;  cl_event event;  cl_mem val = (cl_mem) vector;  id = starpu_worker_get_id ();  devid = starpu_worker_get_devid (id);  /* Prepare to invoke the kernel.  In the future, this will be largely     automated.  */  err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs,				   "vector_mult_opencl", devid);  if (err != CL_SUCCESS)    STARPU_OPENCL_REPORT_ERROR (err);  err = clSetKernelArg (kernel, 0, sizeof (size), &size);  err |= clSetKernelArg (kernel, 1, sizeof (val), &val);  err |= clSetKernelArg (kernel, 2, sizeof (factor), &factor);  if (err)    STARPU_OPENCL_REPORT_ERROR (err);  size_t global = size, local = 1;  err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global, &local, 0,				NULL, &event);  if (err != CL_SUCCESS)    STARPU_OPENCL_REPORT_ERROR (err);  clFinish (queue);  starpu_opencl_collect_stats (event);  clReleaseEvent (event);  starpu_opencl_release_kernel (kernel);}#endif#ifdef STARPU_USE_CUDA/* Declaration of the CUDA implementation.  The definition itself is in the   `.cu' file itself.  */extern void vector_scal_cuda (unsigned int size, float vector[size], float factor)  __attribute__ ((task_implementation ("cuda", vector_scal)));#endif#define EPSILON 1e-3static boolcheck (unsigned int size, float vector[size], float factor){  unsigned int i;  for (i = 0; i < size; i++)    {      if (fabs(vector[i] - i * factor) > i*factor*EPSILON)        {          fprintf(stderr, "%.2f != %.2f\n", vector[i], i*factor);          return false;        }    }  fprintf(stderr, "computation is correct\n");  return true;}intmain (void){  bool valid;#pragma starpu initialize#if defined STARPU_GCC_PLUGIN && defined STARPU_USE_OPENCL  starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",				       &cl_programs, "");#endif#define NX     0x100000#define FACTOR 3.14  {    float vector[NX] __attribute__ ((heap_allocated, registered));    unsigned int i;    for (i = 0; i < NX; i++)      vector[i] = (float) i;    vector_scal (NX, vector, FACTOR);#pragma starpu wait#pragma starpu acquire vector    valid = check (NX, vector, FACTOR);#pragma starpu release vector  } /* VECTOR is automatically freed here.  */#pragma starpu shutdown  return valid ? EXIT_SUCCESS : EXIT_FAILURE;}
 |